In [602]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

data = pd.read_csv('train.csv')
data1 = pd.read_csv('test.csv')

In [603]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2059 entries, 0 to 2058
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   ID              2059 non-null   int64 
 1   Candidate       2059 non-null   object
 2   Constituency ∇  2059 non-null   object
 3   Party           2059 non-null   object
 4   Criminal Case   2059 non-null   int64 
 5   Total Assets    2059 non-null   object
 6   Liabilities     2059 non-null   object
 7   state           2059 non-null   object
 8   Education       2059 non-null   object
dtypes: int64(2), object(7)
memory usage: 144.9+ KB


In [604]:
print(data.shape)
data.head()

(2059, 9)


Unnamed: 0,ID,Candidate,Constituency ∇,Party,Criminal Case,Total Assets,Liabilities,state,Education
0,0,M.K. Mohan,ANNA NAGAR,DMK,4,211 Crore+,2 Crore+,TAMIL NADU,8th Pass
1,1,Khatik Ramesh Prasad,KARERA (SC),BJP,0,1 Crore+,0,MADHYA PRADESH,12th Pass
2,2,Dr. Mantar Gowda,MADIKERI,INC,0,7 Crore+,22 Lac+,KARNATAKA,Post Graduate
3,3,Kundan Kumar,BEGUSARAI,BJP,0,9 Crore+,24 Lac+,BIHAR,Post Graduate
4,4,Swapan Majumder,BANGAON DAKSHIN (SC),BJP,2,2 Crore+,61 Lac+,WEST BENGAL,8th Pass


In [605]:
def convert_to_hund(value):
    value = str(value)
    if value == "0":
        return 0
    elif 'Crore+' in value:
        number, unit = value.split()
        number = float(number.replace(",", ""))
        return int(number * 100000)
    elif 'Lac+' in value:
        number, unit = value.split()
        number = float(number.replace(",", ""))
        return int(number*1000)
    elif 'Thou+' in value:
        number, unit = value.split()
        number = float(number.replace(",", ""))
        return int(number*10)
    elif 'Hund+' in value:
        number, unit = value.split()
        number = float(number.replace(",", ""))
        return int(number)
    else:
        return int(value)

data["Total Assets"] = data["Total Assets"].apply(convert_to_hund)
data["Liabilities"] = data["Liabilities"].apply(convert_to_hund)
data1["Total Assets"] = data1["Total Assets"].apply(convert_to_hund)
data1["Liabilities"] = data1["Liabilities"].apply(convert_to_hund)

import plotly.graph_objects as go

fig = go.Figure()

fig.add_trace(go.Scatter(x=data.index, y=data['Total Assets'], mode='lines', name='Total Assets'))
fig.add_trace(go.Scatter(x=data1.index, y=data1['Liabilities'], mode='lines', name='Liabilities'))

fig.update_layout(title='Line Graph of Total Assets and Liabilities', xaxis_title='Index', yaxis_title='Value')

fig.show()

In [606]:
import plotly.express as px

filtered_data = data[(data['Liabilities'] >= 0) & (data['Liabilities'] <= 2000000) &
                     (data['Total Assets'] >= 0) & (data['Total Assets'] <= 8000000)]

fig = px.scatter(filtered_data, x='Total Assets', y='Liabilities', trendline="ols",
                 title="Plot with Trendline (Filtered Data)")
fig.update_traces(marker=dict(size=8, opacity=0.5))
fig.update_layout(xaxis_title="Total Assets", yaxis_title="Liabilities",
                  yaxis=dict(range=[0, 2000000]), xaxis=dict(range=[0, 8000000]))
fig.show()

In [607]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
Initial_Education = data['Education']
data['Education'] = label_encoder.fit_transform(data['Education'])

data_cpy = data
data_cpy1 = data1

print(data.columns)

data = pd.get_dummies(data, columns=['Party','state'])
data1 = pd.get_dummies(data1, columns=['Party','state'])

Index(['ID', 'Candidate', 'Constituency ∇', 'Party', 'Criminal Case',
       'Total Assets', 'Liabilities', 'state', 'Education'],
      dtype='object')


In [608]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1374 entries, 0 to 1373
Data columns (total 57 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   ID                              1374 non-null   int64 
 1   Candidate                       1374 non-null   object
 2   Constituency ∇                  1374 non-null   object
 3   Criminal Case                   1374 non-null   int64 
 4   Total Assets                    1374 non-null   int64 
 5   Liabilities                     1374 non-null   int64 
 6   Party_AAP                       1374 non-null   bool  
 7   Party_AIADMK                    1374 non-null   bool  
 8   Party_AITC                      1374 non-null   bool  
 9   Party_BJD                       1374 non-null   bool  
 10  Party_BJP                       1374 non-null   bool  
 11  Party_CPI                       1374 non-null   bool  
 12  Party_CPI(M)                    1374 non-null   

In [609]:
print("Train Data: ",data.shape,"Test Data:", data1.shape)
data1.head()

Train Data:  (2059, 58) Test Data: (1374, 57)


Unnamed: 0,ID,Candidate,Constituency ∇,Criminal Case,Total Assets,Liabilities,Party_AAP,Party_AIADMK,Party_AITC,Party_BJD,...,state_ODISHA,state_PUDUCHERRY,state_PUNJAB,state_RAJASTHAN,state_SIKKIM,state_TAMIL NADU,state_TRIPURA,state_UTTAR PRADESH,state_UTTARAKHAND,state_WEST BENGAL
0,0,Geeta Bharat Jain,MEERA BHAYANDAR,2,7000000,1100000,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1,Becharam Manna,SINGUR,1,200000,13000,False,False,True,False,...,False,False,False,False,False,False,False,False,False,True
2,2,Sunil Vijay Tingre,VADGAON SHERI,3,4900000,100000,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,3,Asit Mazumder (Tapan),CHUNCHURA,1,200000,0,False,False,True,False,...,False,False,False,False,False,False,False,False,False,True
4,4,Hriday Narayan Singh Patel,SAGRI,0,1600000,200000,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [610]:
import pandas as pd
import plotly.express as px

party_criminal_cases = data_cpy.groupby('Party')['Criminal Case'].count()

more_than_one_case = party_criminal_cases[party_criminal_cases > 1]
data1234 = {'Party': more_than_one_case.index, 'Count': more_than_one_case.values}
more_than_one_case_df = pd.DataFrame(data1234)
fig = px.line(more_than_one_case_df, x='Party', y='Count', title='Number of People with More than One Criminal Case per Party')
fig.show()

total_people = more_than_one_case_df['Count'].sum()

more_than_one_case_df['Percentage'] = (more_than_one_case_df['Count'] / total_people) * 100

fig = px.pie(more_than_one_case_df, names='Party', values='Percentage', title='Percentage of People with More than One Case per Party')
fig.show()

In [611]:
import plotly.graph_objects as go

label_encoder1 = LabelEncoder()

data_cpy.drop(columns=['Candidate','Constituency ∇'],inplace=True)
data_cpy1.drop(columns=['Candidate','Constituency ∇'],inplace=True)

Initial_Party = data_cpy['Party']
data_cpy['Party'] = label_encoder1.fit_transform(data_cpy['Party'])
data_cpy['state'] = label_encoder1.fit_transform(data_cpy['state'])
data_cpy1['Party'] = label_encoder1.fit_transform(data_cpy1['Party'])
data_cpy1['state'] = label_encoder1.fit_transform(data_cpy1['state'])

import plotly.express as px

for column in data_cpy.columns:
    if column != 'Education':
        fig = px.scatter(data_cpy, x=column, y='Education', color='Education', title=f'Scatter Plot for {column}')
        fig.show()

correlation_matrix = data_cpy.corr()
print("Correlation Matrix for Train Data:")
print(correlation_matrix)

annotations = []
for i, row in enumerate(correlation_matrix.values):
    for j, value in enumerate(row):
        font_color = 'black' if i == j else 'white'
        annotations.append(
            dict(
                text="{:.2f}".format(value),
                x=correlation_matrix.columns[j],
                y=correlation_matrix.index[i],
                xref='x1', yref='y1',
                font=dict(color=font_color),
                showarrow=False
            )
        )

fig = go.Figure(data=go.Heatmap(
                   z=correlation_matrix.values,
                   x=correlation_matrix.columns,
                   y=correlation_matrix.index,
                   colorscale='Viridis',
                   colorbar=dict(title='Correlation')
               ))

fig.update_layout(
    title='Correlation Matrix for Train Data',
    xaxis=dict(title='Columns'),
    yaxis=dict(title='Columns'),
    plot_bgcolor='black',
    annotations=annotations
)

fig.show()


correlation_matrix1 = data_cpy1.corr()
print("Correlation Matrix for Test Data:")
print(correlation_matrix)

annotations = []
for i, row in enumerate(correlation_matrix.values):
    for j, value in enumerate(row):
        font_color = 'black' if i == j else 'white'
        annotations.append(
            dict(
                text="{:.2f}".format(value),
                x=correlation_matrix.columns[j],
                y=correlation_matrix.index[i],
                xref='x1', yref='y1',
                font=dict(color=font_color),
                showarrow=False
            )
        )

fig = go.Figure(data=go.Heatmap(
                   z=correlation_matrix.values,
                   x=correlation_matrix.columns,
                   y=correlation_matrix.index,
                   colorscale='Viridis',
                   colorbar=dict(title='Correlation')
               ))

fig.update_layout(
    title='Correlation Matrix for Test data',
    xaxis=dict(title='Columns'),
    yaxis=dict(title='Columns'),
    plot_bgcolor='black',
    annotations=annotations
)

fig.show()


Correlation Matrix for Train Data:
                     ID     Party  Criminal Case  Total Assets  Liabilities  \
ID             1.000000  0.054653       0.034991      0.006593    -0.001341   
Party          0.054653  1.000000       0.110415      0.057602     0.019578   
Criminal Case  0.034991  0.110415       1.000000     -0.012985    -0.000981   
Total Assets   0.006593  0.057602      -0.012985      1.000000     0.597508   
Liabilities   -0.001341  0.019578      -0.000981      0.597508     1.000000   
state         -0.034855 -0.325900       0.074785     -0.077022    -0.031056   
Education      0.026899 -0.002965      -0.005312      0.003603     0.009539   

                  state  Education  
ID            -0.034855   0.026899  
Party         -0.325900  -0.002965  
Criminal Case  0.074785  -0.005312  
Total Assets  -0.077022   0.003603  
Liabilities   -0.031056   0.009539  
state          1.000000   0.068529  
Education      0.068529   1.000000  


Correlation Matrix for Test Data:
                     ID     Party  Criminal Case  Total Assets  Liabilities  \
ID             1.000000  0.054653       0.034991      0.006593    -0.001341   
Party          0.054653  1.000000       0.110415      0.057602     0.019578   
Criminal Case  0.034991  0.110415       1.000000     -0.012985    -0.000981   
Total Assets   0.006593  0.057602      -0.012985      1.000000     0.597508   
Liabilities   -0.001341  0.019578      -0.000981      0.597508     1.000000   
state         -0.034855 -0.325900       0.074785     -0.077022    -0.031056   
Education      0.026899 -0.002965      -0.005312      0.003603     0.009539   

                  state  Education  
ID            -0.034855   0.026899  
Party         -0.325900  -0.002965  
Criminal Case  0.074785  -0.005312  
Total Assets  -0.077022   0.003603  
Liabilities   -0.031056   0.009539  
state          1.000000   0.068529  
Education      0.068529   1.000000  


In [612]:
import plotly.express as px

data_cpy['state'] = label_encoder1.inverse_transform(data_cpy['state'])
data_cpy['Education'] =  Initial_Education
for column in data_cpy.columns:
    if column != 'Education' and column == 'state':
        fig = px.scatter(data_cpy, x=column, y='Education', color='Education', title=f'Scatter Plot for {column}')
        fig.show()


In [613]:
data

Unnamed: 0,ID,Candidate,Constituency ∇,Criminal Case,Total Assets,Liabilities,Education,Party_AAP,Party_AIADMK,Party_AITC,...,state_ODISHA,state_PUDUCHERRY,state_PUNJAB,state_RAJASTHAN,state_SIKKIM,state_TAMIL NADU,state_TRIPURA,state_UTTAR PRADESH,state_UTTARAKHAND,state_WEST BENGAL
0,0,M.K. Mohan,ANNA NAGAR,4,21100000,200000,3,False,False,False,...,False,False,False,False,False,True,False,False,False,False
1,1,Khatik Ramesh Prasad,KARERA (SC),0,100000,0,1,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,2,Dr. Mantar Gowda,MADIKERI,0,700000,22000,9,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,3,Kundan Kumar,BEGUSARAI,0,900000,24000,9,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,4,Swapan Majumder,BANGAON DAKSHIN (SC),2,200000,61000,3,False,False,False,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2054,2054,V. Sasi,CHIRAYINKEEZHU,1,61000,10000,6,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2055,2055,Pushkar Lal Dangi,MAVLI,0,200000,8000,0,False,False,False,...,False,False,False,True,False,False,False,False,False,False
2056,2056,Dr. Manju Shiwach,MODI NAGAR,0,1300000,85000,5,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2057,2057,Mansing Fattesingrao Naik,SHIRALA,1,2500000,94000,1,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [614]:
def get_constituency_type(constituency):
    if '(SC)' in constituency:
        return 1
    elif '(ST)' in constituency:
        return 2
    else:
        return 0

data['Constituency_type'] = data['Constituency ∇'].apply(get_constituency_type)
data1['Constituency_type'] = data1['Constituency ∇'].apply(get_constituency_type)

data['Is_Doctor'] = data['Candidate'].apply(lambda x: 1 if 'Dr.' in x else 0)
data1['Is_Doctor'] = data1['Candidate'].apply(lambda x: 1 if 'Dr.' in x else 0)

data['Is_Advocate'] = data['Candidate'].apply(lambda x: 1 if 'Adv.' in x else 0)
data1['Is_Advocate'] = data1['Candidate'].apply(lambda x: 1 if 'Adv.' in x else 0)

data_for_plot = data

data = data.drop(columns=['ID','Candidate','Constituency ∇'])
data1.drop(columns=['ID','Candidate','Constituency ∇'], inplace=True)

print(data.columns)
print(data1.columns)

data['Aggregate_Assets'] = data['Total Assets'] - data['Liabilities']
data1['Aggregate_Assets'] = data1['Total Assets'] - data1['Liabilities']

min_value = data['Aggregate_Assets'].min()
data['Aggregate_Assets'] = data['Aggregate_Assets'] - min_value
min_value1 = data1['Aggregate_Assets'].min()
data1['Aggregate_Assets'] = data1['Aggregate_Assets'] - min_value1
data_for_plot

Index(['Criminal Case', 'Total Assets', 'Liabilities', 'Education',
       'Party_AAP', 'Party_AIADMK', 'Party_AITC', 'Party_BJD', 'Party_BJP',
       'Party_CPI', 'Party_CPI(M)', 'Party_DMK', 'Party_INC', 'Party_IND',
       'Party_JD(S)', 'Party_JD(U)', 'Party_JMM', 'Party_NCP', 'Party_NDPP',
       'Party_NPP', 'Party_RJD', 'Party_SHS', 'Party_SP',
       'Party_Sikkim Krantikari Morcha', 'Party_TDP',
       'Party_Tipra Motha Party', 'Party_YSRCP', 'state_ANDHRA PRADESH',
       'state_ARUNACHAL PRADESH', 'state_ASSAM', 'state_BIHAR',
       'state_CHHATTISGARH', 'state_DELHI', 'state_GOA', 'state_GUJARAT',
       'state_HARYANA', 'state_HIMACHAL PRADESH', 'state_JHARKHAND',
       'state_KARNATAKA', 'state_KERALA', 'state_MADHYA PRADESH',
       'state_MAHARASHTRA', 'state_MANIPUR', 'state_MEGHALAYA',
       'state_NAGALAND', 'state_ODISHA', 'state_PUDUCHERRY', 'state_PUNJAB',
       'state_RAJASTHAN', 'state_SIKKIM', 'state_TAMIL NADU', 'state_TRIPURA',
       'state_UTTAR PRADES

Unnamed: 0,ID,Candidate,Constituency ∇,Criminal Case,Total Assets,Liabilities,Education,Party_AAP,Party_AIADMK,Party_AITC,...,state_RAJASTHAN,state_SIKKIM,state_TAMIL NADU,state_TRIPURA,state_UTTAR PRADESH,state_UTTARAKHAND,state_WEST BENGAL,Constituency_type,Is_Doctor,Is_Advocate
0,0,M.K. Mohan,ANNA NAGAR,4,21100000,200000,3,False,False,False,...,False,False,True,False,False,False,False,0,0,0
1,1,Khatik Ramesh Prasad,KARERA (SC),0,100000,0,1,False,False,False,...,False,False,False,False,False,False,False,1,0,0
2,2,Dr. Mantar Gowda,MADIKERI,0,700000,22000,9,False,False,False,...,False,False,False,False,False,False,False,0,1,0
3,3,Kundan Kumar,BEGUSARAI,0,900000,24000,9,False,False,False,...,False,False,False,False,False,False,False,0,0,0
4,4,Swapan Majumder,BANGAON DAKSHIN (SC),2,200000,61000,3,False,False,False,...,False,False,False,False,False,False,True,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2054,2054,V. Sasi,CHIRAYINKEEZHU,1,61000,10000,6,False,False,False,...,False,False,False,False,False,False,False,0,0,0
2055,2055,Pushkar Lal Dangi,MAVLI,0,200000,8000,0,False,False,False,...,True,False,False,False,False,False,False,0,0,0
2056,2056,Dr. Manju Shiwach,MODI NAGAR,0,1300000,85000,5,False,False,False,...,False,False,False,False,True,False,False,0,1,0
2057,2057,Mansing Fattesingrao Naik,SHIRALA,1,2500000,94000,1,False,False,False,...,False,False,False,False,False,False,False,0,0,0


In [615]:
temp = data_for_plot['Education']
data_for_plot['Education'] = Initial_Education

grouped_data = data_for_plot.groupby(['Constituency_type', 'Education']).size().reset_index(name='Count')

sc_data = grouped_data[grouped_data['Constituency_type'] == 1]

st_data = grouped_data[grouped_data['Constituency_type'] == 2]

other_data = grouped_data[grouped_data['Constituency_type'] == 0]

fig1 = px.pie(sc_data, names='Education', values='Count', title='Education Distribution in SC Constituency')
fig2 = px.pie(st_data, names='Education', values='Count', title='Education Distribution in ST Constituency')
fig3 = px.pie(other_data, names='Education', values='Count', title='Education Distribution in Other Constituencies')

fig1.show()
fig2.show()
fig3.show()
data_for_plot['Education'] = temp

In [616]:
data_cpy['Aggregate_Assets'] = data['Aggregate_Assets']
temp = data_cpy['Party']
data_cpy['Party'] = Initial_Party
more_than_10m_assets = data_cpy[data_cpy['Aggregate_Assets'] > 10000000]
party_high_assets = more_than_10m_assets.groupby('Party')['Aggregate_Assets'].count()

data007 = {'Party': party_high_assets.index, 'Count': party_high_assets.values}
high_assets_df = pd.DataFrame(data007)

fig = px.line(high_assets_df, x='Party', y='Count', title='Number of Candidates with More than 10,000,000 Aggregate Assets per Party')
fig.show()
data_cpy['Party'] = temp

total_candidates = high_assets_df['Count'].sum()

high_assets_df['Percentage'] = (high_assets_df['Count'] / total_candidates) * 100

fig = px.pie(high_assets_df, names='Party', values='Percentage', title='Percentage of Candidates with More than 10,000,000 Aggregate Assets per Party')
fig.show()

In [617]:
data

Unnamed: 0,Criminal Case,Total Assets,Liabilities,Education,Party_AAP,Party_AIADMK,Party_AITC,Party_BJD,Party_BJP,Party_CPI,...,state_SIKKIM,state_TAMIL NADU,state_TRIPURA,state_UTTAR PRADESH,state_UTTARAKHAND,state_WEST BENGAL,Constituency_type,Is_Doctor,Is_Advocate,Aggregate_Assets
0,4,21100000,200000,3,False,False,False,False,False,False,...,False,True,False,False,False,False,0,0,0,28000000
1,0,100000,0,1,False,False,False,False,True,False,...,False,False,False,False,False,False,1,0,0,7200000
2,0,700000,22000,9,False,False,False,False,False,False,...,False,False,False,False,False,False,0,1,0,7778000
3,0,900000,24000,9,False,False,False,False,True,False,...,False,False,False,False,False,False,0,0,0,7976000
4,2,200000,61000,3,False,False,False,False,True,False,...,False,False,False,False,False,True,1,0,0,7239000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2054,1,61000,10000,6,False,False,False,False,False,True,...,False,False,False,False,False,False,0,0,0,7151000
2055,0,200000,8000,0,False,False,False,False,False,False,...,False,False,False,False,False,False,0,0,0,7292000
2056,0,1300000,85000,5,False,False,False,False,True,False,...,False,False,False,True,False,False,0,1,0,8315000
2057,1,2500000,94000,1,False,False,False,False,False,False,...,False,False,False,False,False,False,0,0,0,9506000


In [618]:
data['ID'] = data.index
# print(data['ID'])
import plotly.express as px

fig = px.line(data, x='ID', y="Aggregate_Assets", title='Aggregate Assets v/s ID')
fig.show()

fig = px.line(data, x='ID', y="Liabilities", title='Liabilities v/s ID')
fig.show()

fig = px.line(data, x='ID', y="Criminal Case", title='Criminal Cases v/s ID')
fig.show()
data.drop(columns=['ID'],inplace=True)

data.drop(columns=['Total Assets','Liabilities'], inplace=True)
data1.drop(columns=['Total Assets','Liabilities'], inplace=True)

In [619]:
label_encoder2 = LabelEncoder()

data_for_plot['Candidate'] = label_encoder2.fit_transform(data_for_plot['Candidate'])
data_for_plot['Constituency ∇'] = label_encoder2.fit_transform(data_for_plot['Constituency ∇'])

import plotly.graph_objects as go

correlations = {
    'Candidate': data['Education'].corr(data_for_plot['Candidate']),
    'Is_Doctor': data['Education'].corr(data_for_plot['Is_Doctor']),
    'Constituency ∇': data['Education'].corr(data_for_plot['Constituency ∇']),
    'Constituency_type': data['Education'].corr(data_for_plot['Constituency_type'])
}

fig = go.Figure(data=[
    go.Bar(name='Correlation', x=list(correlations.keys()), y=list(correlations.values()))
])

fig.update_layout(title='Changes in Correlation with Education',
                  xaxis_title='Feature', yaxis_title='Correlation with Education')

fig.show()

In [620]:
import pandas as pd
import plotly.express as px

merged_df = pd.merge(Initial_Education, data['Is_Doctor'], left_index=True, right_index=True)
merged_df = pd.merge(merged_df, data['Is_Advocate'], left_index=True, right_index=True)

is_doctor_or_advocate = (merged_df['Is_Doctor'] == 1) | (merged_df['Is_Advocate'] == 1)
filtered_df = merged_df[is_doctor_or_advocate]

education_dist = filtered_df.groupby(['Education', 'Is_Doctor', 'Is_Advocate']).size().reset_index(name='count')

sunburst_data = {
    'Education': education_dist['Education'],
    'Type': ['Doctor' if is_doc == 1 else 'Advocate' for is_doc, is_adv in zip(education_dist['Is_Doctor'], education_dist['Is_Advocate'])],
    'Count': education_dist['count']
}

fig = px.sunburst(
    sunburst_data,
    path=['Type', 'Education'],
    values='Count',
    color='Education',
    title="Distribution of Education for Doctors and Advocates",
    branchvalues="total",
    # hole=0.3,
    template="plotly_white"
)

fig.update_layout(
    margin=dict(t=50, b=50, l=50, r=50),
    font_size=14,
    font_color="black",
    title={
        'y': 0.95,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    }
)

fig.show()

In [621]:
import plotly.express as px

value_counts = data['Criminal Case'].value_counts()

top_values = value_counts.head(top_n)

fig = px.bar(x=top_values.index, y=top_values.values)

fig.update_layout(xaxis_title='Criminal Case', yaxis_title='Frequency')

fig.update_layout(yaxis_type='log')

fig.show()

In [622]:
import numpy as np
import plotly.graph_objects as go
from sklearn.naive_bayes import BernoulliNB

X = data.drop(columns='Education')
y=data['Education']
alphas = np.arange(0, 1.1, 0.1)
binarizes = np.arange(0, 1.1, 0.1)
hyperparameters = [{'alpha': alpha, 'binarize': binarize} for alpha in alphas for binarize in binarizes]

validation_scores = []

for params in hyperparameters:
    X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(X, y, test_size=0.21, random_state=42)

    bernoulli_nb = BernoulliNB(alpha=params['alpha'], binarize=params['binarize'])
    bernoulli_nb.fit(X_train_split, y_train_split)

    predictions = bernoulli_nb.predict(X_test_split)

    f1 = f1_score(y_test_split, predictions, average='weighted')

    validation_scores.append(f1)

fig = go.Figure(data=go.Scatter(x=[params['alpha'] for params in hyperparameters],
                                y=[params['binarize'] for params in hyperparameters],
                                mode='markers',
                                marker=dict(color=validation_scores,
                                            colorscale='Viridis',
                                            size=10,
                                            colorbar=dict(title='F1 Score')),
                                text=validation_scores,
                                hoverinfo='text'))

fig.update_layout(title='Validation Scores for Different Hyperparameter Combinations',
                  xaxis_title='Alpha', yaxis_title='Binarize')

fig.show()





alpha too small will result in numeric errors, setting alpha = 1.0e-10. Use `force_alpha=True` to keep alpha unchanged.




alpha too small will result in numeric errors, setting alpha = 1.0e-10. Use `force_alpha=True` to keep alpha unchanged.




alpha too small will result in numeric errors, setting alpha = 1.0e-10. Use `force_alpha=True` to keep alpha unchanged.




alpha too small will result in numeric errors, setting alpha = 1.0e-10. Use `force_alpha=True` to keep alpha unchanged.




alpha too small will result in numeric errors, setting alpha = 1.0e-10. Use `force_alpha=True` to keep alpha unchanged.




alpha too small will result in numeric errors, setting alpha = 1.0e-10. Use `force_alpha=True` to keep alpha unchanged.




alpha too small will result in numeric errors, setting alpha = 1.0e-10. Use `force_alpha=True` to keep alpha unchanged.




alpha too small will result in numeric errors, setting alpha = 1.0e-10. Use `force_alpha=True` to keep alpha unchanged.




alpha

In [623]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)
ans = 0
for train_index, test_index in kf.split(data):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    bernoulli_nb = BernoulliNB(
        alpha=0.5,
        binarize=0.1,
        fit_prior=True,
        class_prior=None
    )
    bernoulli_nb.fit(X_train, y_train)

    score = bernoulli_nb.score(X_test, y_test)
    print(f"Accuracy on fold: {score:.10f}")
    ans += score

print(ans/5)


Accuracy on fold: 0.2354368932
Accuracy on fold: 0.2354368932
Accuracy on fold: 0.2500000000
Accuracy on fold: 0.2500000000
Accuracy on fold: 0.2238442822
0.23894361372924194


In [624]:
bernoulli_nb = BernoulliNB(
    alpha=0.5,
    binarize=0.1,
    fit_prior=True,
    class_prior=None
)
bernoulli_nb.fit(X_train_split, y_train_split)

predictions = bernoulli_nb.predict(X_test_split)
# predictions = label_encoder.inverse_transform(predictions)
predictions

array([1, 9, 5, 5, 9, 9, 5, 6, 4, 6, 1, 5, 6, 9, 5, 0, 6, 0, 5, 6, 1, 5,
       6, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, 5, 1, 9, 5, 1, 5, 4, 5, 9, 9, 0,
       5, 6, 6, 1, 3, 6, 0, 0, 9, 5, 1, 5, 1, 9, 5, 9, 5, 5, 6, 6, 6, 5,
       5, 6, 0, 6, 9, 9, 9, 0, 5, 6, 5, 5, 6, 9, 1, 0, 9, 6, 4, 5, 6, 5,
       9, 5, 5, 5, 1, 9, 6, 9, 5, 5, 5, 5, 9, 9, 9, 4, 1, 9, 9, 0, 1, 5,
       5, 3, 9, 5, 6, 0, 5, 6, 0, 9, 5, 9, 9, 5, 0, 9, 6, 9, 6, 5, 5, 5,
       6, 5, 6, 1, 5, 5, 6, 9, 5, 9, 5, 5, 9, 9, 0, 1, 5, 3, 5, 6, 5, 9,
       5, 5, 6, 9, 6, 0, 6, 5, 9, 6, 5, 5, 5, 9, 5, 6, 9, 5, 9, 1, 5, 5,
       5, 5, 6, 6, 5, 5, 5, 5, 9, 9, 5, 1, 6, 5, 5, 9, 9, 5, 9, 6, 5, 1,
       5, 6, 9, 6, 6, 5, 9, 6, 9, 9, 9, 5, 1, 9, 1, 1, 3, 5, 5, 9, 4, 5,
       0, 5, 5, 1, 5, 5, 5, 6, 5, 1, 1, 5, 6, 5, 1, 1, 9, 5, 9, 6, 6, 9,
       6, 9, 9, 6, 5, 5, 5, 1, 6, 9, 0, 0, 5, 6, 5, 5, 6, 5, 9, 9, 0, 0,
       5, 6, 9, 6, 5, 5, 1, 5, 1, 5, 9, 3, 5, 5, 1, 5, 0, 6, 9, 6, 6, 6,
       0, 9, 1, 9, 5, 5, 9, 1, 5, 1, 9, 9, 6, 6, 5,

In [625]:
f1 = f1_score(y_test_split, predictions, average='weighted')
print("F1 Score:", f1)

F1 Score: 0.21304266958043186


In [626]:
bernoulli_nb = BernoulliNB(
    alpha=0.5,
    binarize=0.1,
    fit_prior=True,
    class_prior=None
)
bernoulli_nb.fit(X_train_split, y_train_split)

predictions = bernoulli_nb.predict(data1)
predictions = label_encoder.inverse_transform(predictions)
predictions

array(['12th Pass', 'Graduate', '12th Pass', ..., 'Graduate', 'Graduate',
       'Post Graduate'], dtype=object)

In [627]:
df_pred = pd.DataFrame(predictions, columns=["Education"])
df_pred.to_csv('Submission.csv', index_label='ID')