# Data Preprocessing

## Data And Library Importing

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sn
import statsmodels.discrete.discrete_model as sm
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from IPython.display import Image
import pydotplus
plt.style.use('dark_background')
import warnings
warnings.filterwarnings('ignore')

In [5]:
df = pd.read_csv('LS_2.0.csv', header = 0)
df.head()

Unnamed: 0,STATE,CONSTITUENCY,NAME,WINNER,PARTY,SYMBOL,GENDER,CRIMINAL\nCASES,AGE,CATEGORY,EDUCATION,ASSETS,LIABILITIES,GENERAL\nVOTES,POSTAL\nVOTES,TOTAL\nVOTES,OVER TOTAL ELECTORS \nIN CONSTITUENCY,OVER TOTAL VOTES POLLED \nIN CONSTITUENCY,TOTAL ELECTORS
0,Telangana,ADILABAD,SOYAM BAPU RAO,1,BJP,Lotus,MALE,52.0,52.0,ST,12th Pass,"Rs 30,99,414\n ~ 30 Lacs+","Rs 2,31,450\n ~ 2 Lacs+",376892,482,377374,25.330684,35.468248,1489790
1,Telangana,ADILABAD,Godam Nagesh,0,TRS,Car,MALE,0.0,54.0,ST,Post Graduate,"Rs 1,84,77,888\n ~ 1 Crore+","Rs 8,47,000\n ~ 8 Lacs+",318665,149,318814,21.399929,29.96437,1489790
2,Telangana,ADILABAD,RATHOD RAMESH,0,INC,Hand,MALE,3.0,52.0,ST,12th Pass,"Rs 3,64,91,000\n ~ 3 Crore+","Rs 1,53,00,000\n ~ 1 Crore+",314057,181,314238,21.092771,29.534285,1489790
3,Telangana,ADILABAD,NOTA,0,NOTA,,,,,,,,,13030,6,13036,0.875023,1.225214,1489790
4,Uttar Pradesh,AGRA,Satyapal Singh Baghel,1,BJP,Lotus,MALE,5.0,58.0,SC,Doctorate,"Rs 7,42,74,036\n ~ 7 Crore+","Rs 86,06,522\n ~ 86 Lacs+",644459,2416,646875,33.383823,56.464615,1937690


## Missing Values And Cleaning up The Values

In [7]:
df.isnull().sum()

STATE                                          0
CONSTITUENCY                                   0
NAME                                           0
WINNER                                         0
PARTY                                          0
SYMBOL                                       245
GENDER                                       245
CRIMINAL\nCASES                              245
AGE                                          245
CATEGORY                                     245
EDUCATION                                    245
ASSETS                                       245
LIABILITIES                                  245
GENERAL\nVOTES                                 0
POSTAL\nVOTES                                  0
TOTAL\nVOTES                                   0
OVER TOTAL ELECTORS \nIN CONSTITUENCY          0
OVER TOTAL VOTES POLLED \nIN CONSTITUENCY      0
TOTAL ELECTORS                                 0
dtype: int64

In [8]:
df[df.SYMBOL.isnull()==True]['NAME'].unique()

array(['NOTA'], dtype=object)

In [9]:
# Filter rows where 'NAME' column contains 'NOTA'
nota_rows = df[df['NAME'].str.contains('NOTA', na=False)]

# Display the filtered rows
print(nota_rows)

               STATE    CONSTITUENCY  NAME  WINNER PARTY SYMBOL GENDER  \
3          Telangana        ADILABAD  NOTA       0  NOTA    NaN    NaN   
14           Gujarat  AHMEDABAD WEST  NOTA       0  NOTA    NaN    NaN   
39       West Bengal     ALIPURDUARS  NOTA       0  NOTA    NaN    NaN   
46       Uttarakhand          ALMORA  NOTA       0  NOTA    NaN    NaN   
54    Andhra Pradesh      AMALAPURAM  NOTA       0  NOTA    NaN    NaN   
...              ...             ...   ...     ...   ...    ...    ...   
2225      Tamil Nadu    VIRUDHUNAGAR  NOTA       0  NOTA    NaN    NaN   
2230  Andhra Pradesh   VISAKHAPATNAM  NOTA       0  NOTA    NaN    NaN   
2235  Andhra Pradesh    VIZIANAGARAM  NOTA       0  NOTA    NaN    NaN   
2241       Telangana        WARANGAL  NOTA       0  NOTA    NaN    NaN   
2262       Telangana       ZAHIRABAD  NOTA       0  NOTA    NaN    NaN   

     CRIMINAL\nCASES  AGE CATEGORY EDUCATION ASSETS LIABILITIES  \
3                NaN  NaN      NaN       NaN

In [10]:
df_NOTA = df

In [11]:
# Remove rows where 'NAME' column contains 'NOTA'
df= df[~df['NAME'].str.contains('NOTA', na=False)]

In [12]:
#Cleaning up the Assets and Liabilities columns
def value_cleaner(x):
    try:
        str_temp = (x.split('Rs')[1].split('\n')[0].strip())
        str_temp_2 = ''
        for i in str_temp.split(","):
            str_temp_2 = str_temp_2+i
        return str_temp_2
    except:
        x = 0
        return x
df_REMOVE_NOTA['ASSETS'] = df_REMOVE_NOTA['ASSETS'].apply((value_cleaner))
df_REMOVE_NOTA['LIABILITIES'] = df_REMOVE_NOTA['LIABILITIES'].apply((value_cleaner))
df_REMOVE_NOTA.head()

NameError: name 'df_REMOVE_NOTA' is not defined

In [None]:
# Renaming the Columns
df.rename(columns={"CRIMINAL\nCASES": "CRIMINAL CASES", "GENERAL\nVOTES": "GENERAL VOTES", "POSTAL\nVOTES": "POSTAL VOTES","TOTAL\nVOTES": "TOTAL VOTES","OVER TOTAL ELECTORS \nIN CONSTITUENCY": "OVER TOTAL ELECTORS IN CONSTITUENCY","OVER TOTAL VOTES POLLED \nIN CONSTITUENCY": "OVER TOTAL VOTES POLLED IN CONSTITUENCY"}, inplace=True)
df.head()

In [None]:
# Cleaning up the Educational Qualification of the election contestants
df.EDUCATION.unique()

In [None]:
df.EDUCATION.replace({'Post Graduate\n':'Post Graduate'},inplace=True)
df.EDUCATION.unique()

In [None]:
# Updating the data types for the analysis
df['ASSETS']=pd.to_numeric(df['ASSETS'])
df['LIABILITIES']=pd.to_numeric(df['LIABILITIES'])
df['CRIMINAL CASES'].replace({np.NaN:0})
df['CRIMINAL CASES'] = pd.to_numeric(df['CRIMINAL CASES'], errors='coerce').fillna(0).astype(np.int64)

In [None]:
df.info()

# The Analysis Using Chart

## State and Constituency Level Analysis

### What is the distribution of Constituencies over all the states?

In [None]:
pip install geopandas

In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt
import plotly.express as px
from IPython.display import display, HTML

# Group by state and count the number of constituencies in each state
st_con = df.groupby('STATE').apply(lambda x: x['CONSTITUENCY'].nunique()).reset_index(name='# Constituency')

# Read the shapefile
shp_gdf = gpd.read_file('Indian_States.shp')

# Merge shapefile GeoDataFrame with your DataFrame
merged = shp_gdf.set_index('st_nm').join(st_con.set_index('STATE'))

# Plotting the GeoDataFrame with Matplotlib
fig, ax1 = plt.subplots(1, 1, figsize=(15, 10))  # 15 inches wide, 10 inches high

# Set the figure and axis background color to gray
fig.patch.set_facecolor('gray')
ax1.set_facecolor('gray')

merged.plot(column='# Constituency', cmap='inferno_r', linewidth=0.5, ax=ax1, edgecolor='0.2', legend=True)
ax1.set_title('State-wise Distribution of Indian Constituencies', fontdict={'fontsize': '15', 'fontweight': '3'})
ax1.axis('off')

# Save the Matplotlib figure to an image file
fig.savefig("matplotlib_figure.png", bbox_inches='tight', facecolor=fig.get_facecolor())

# Close the Matplotlib figure
plt.close(fig)

# Sorting the DataFrame for Plotly bar chart
st_con.sort_values(by='# Constituency', ascending=True, inplace=True)

# Plotting the bar chart with Plotly
fig2 = px.bar(st_con, y='STATE', x='# Constituency',  # Switched x and y parameters
              color='# Constituency',
              labels={'pop': 'Constituencies of India'},
              orientation='h',  # Set orientation to horizontal
              )

fig2.update_layout(
    title_text='Statewise distribution of the Constituencies all over India',
    template='plotly_dark',
    plot_bgcolor='gray',
    paper_bgcolor='gray',
    width=550,  # Set the width of the plot
    height=650  # Set the height of the plot (same as Matplotlib figure)
)

# Save the Plotly figure to an HTML file
fig2_html = fig2.to_html(full_html=False)

# Display both plots side by side
display(HTML(f"""
<div style="display: flex; justify-content: space-between;">
    <div style="width: 50%;">
        <img src="matplotlib_figure.png" style="width: 100%; height: 650px;">
    </div>
    <div style="width: 50%;">
        {fig2_html}
    </div>
</div>
"""))


**Observation:** Uttar Pradesh, Maharashtra and West Bengal- The sates have the most number of constituencies. There exists a direct relationship of count of constituencies and population- The constituencies are divided based on the population of 1971- and this shall remain till the year 2026. Although currently Bihar has a higher population, West Bengal has the 3rd highest constituency count based on the above fact.

### Lets create a Sunburst image of all the States and Constituencies

In [None]:
# Select the required columns from the DataFrame 'df'
st_con_vt = df[['STATE', 'CONSTITUENCY', 'TOTAL ELECTORS']]

# Create a sunburst plot using Plotly Express
fig = px.sunburst(
    st_con_vt,                    # The DataFrame to use for the plot
    path=['STATE', 'CONSTITUENCY'], # Define the hierarchical structure of the sunburst (STATE -> CONSTITUENCY)
    values='TOTAL ELECTORS',       # Define the values to aggregate (TOTAL ELECTORS)
    color='TOTAL ELECTORS',        # Use 'TOTAL ELECTORS' to determine the color of the segments
    color_continuous_scale='viridis_r' # Set the color scale to 'viridis_r' (reversed Viridis)
)

# Update the layout of the plot
fig.update_layout(
    title_text='Sunburst Image of State and Constituency by Voters', # Set the title of the plot
    template='plotly_dark',                                         # Use the 'plotly_dark' template for the plot's theme
    width=1050,  # Set the width of the plot
    height=750  # Set the height of the plot
)
fig.show()


## Party Level Analysis

### Which Parties have been present in most constituencies and States?

In [None]:
# Group by 'PARTY' and count the number of unique 'CONSTITUENCY' entries
prty_cnt = df.groupby('PARTY').agg({'CONSTITUENCY': 'count', 'STATE': 'nunique'}).reset_index()
prty_cnt.columns = ['PARTY', '# Constituency', '# State']

# Sort the parties by the number of constituencies they are contesting in, in descending order
prty_top_all = prty_cnt.nlargest(25, '# Constituency')

# Create a scatter plot
fig = px.scatter(prty_top_all, x='# Constituency', y='# State', color='# State', size='# Constituency',
                 hover_data=['PARTY'], title='Constituency vs Statewise participation for Top Political Parties',
                 template='plotly_dark', labels={'# Constituency': 'Constituency Count', '# State': 'State Count'})
fig.update_layout(plot_bgcolor='gray', paper_bgcolor='gray')

# Display the plot
fig.show()


**Observation :** The Bharatiya Janata Party (BJP) and Indian National Congress (INC) have participated in the most number of constituencies all over India. While BJP leads in the number of constituency contested, INC wins in terms of the number of States. While these are the major parties to contest almost all over India, we see the rest of the parties have restricted themselves to a handfull of states.

### Which party has won the most constituencies?

In [None]:
# Group by 'PARTY' and sum the 'WINNER' column to get the total wins per party
part_win = df.groupby('PARTY')['WINNER'].sum().nlargest(15).reset_index(name='# Wins')

# Create a bar plot
fig = px.bar(part_win, x='PARTY', y='# Wins', color='# Wins',
             title='Win Counts by Political Party in 2019', template='plotly_dark')

# Display the plot
fig.show()


In [None]:
'BJP', 'INC', 'DMK', 'AITC','YSRCP', 'SHS','JD(U)', 'BJD', 'BSP', 'TRS', 'LJP', 'CPI(M)', 'NCP', 'SP', 'IND'

**Observation:** As seen from the data, In 2019, BJP has won the maximum constituencies all over India. The Image below the introduction also suggests the same. The distribution of all the parties is presented below. INC, who stood 2nd in the number of victories had only 52, which is practically 1/6th of the constituencies won by BJP

### What has been the general Win vs Loss relationship for the Parties in 2019?

In [None]:
# Merge 'prty_cnt' and 'part_win' DataFrames on 'PARTY' column
prty_cnt_win = pd.merge(prty_cnt, part_win, on='PARTY')

# Calculate the number of lost constituencies for each party
prty_cnt_win['Lost'] = prty_cnt_win['# Constituency'] - prty_cnt_win['# Wins']

# Create DataFrames for won and lost constituencies
prty_wins_cnt = prty_cnt_win[['PARTY', '# Wins']].copy()
prty_loss_cnt = prty_cnt_win[['PARTY', 'Lost']].copy()

# Add a 'Verdict' column to indicate whether it's a won or lost constituency
prty_wins_cnt['Verdict'] = 'Constituency Won'
prty_loss_cnt['Verdict'] = 'Constituency Lost'

# Rename columns
prty_wins_cnt.columns = ['Party', 'Counts', 'Verdict']
prty_loss_cnt.columns = ['Party', 'Counts', 'Verdict']

# Select top 15 parties for both wins and losses
top_prty_wins_cnt = prty_wins_cnt.head(15)
prty_loss_cnt_cnt = prty_loss_cnt.head(15)

# Concatenate DataFrames for wins and losses
prt_win_loss = pd.concat([top_prty_wins_cnt, prty_loss_cnt_cnt])

# Create a bar plot
fig = px.bar(prt_win_loss, x='Party', y='Counts', color='Verdict')

# Update the layout of the plot
fig.update_layout(
    title_text='Win vs Loss Analysis for the Top Parties',  # Set the plot title
    template='plotly_dark'                                 # Use the 'plotly_dark' template for the plot's theme
)

# Display the plot
fig.show()


### What has been the performance of the Parties Statewise in Uttar Pradesh, Maharashtra and West Bengal?

In [None]:
import pandas as pd
import plotly.express as px

# Filter the DataFrame for Uttar Pradesh, Maharashtra, and West Bengal
selected_states = ['Uttar Pradesh', 'Maharashtra', 'West Bengal']
df_selected_states = df[df['STATE'].isin(selected_states)]

# Group by 'PARTY' and 'STATE' and count the number of seats
party_state_performance = df_selected_states.groupby(['PARTY', 'STATE']).size().reset_index(name='Performance')

# Create a bar plot
fig = px.bar(party_state_performance, x='PARTY', y='Performance', color='STATE',
             title='Performance of Parties Statewise in Uttar Pradesh, Maharashtra, and West Bengal',
             labels={'Performance': 'Number of Seats', 'PARTY': 'Party'},
             template='plotly_dark')

# Sort the parties by their total performance in descending order
sorted_parties = party_state_performance.groupby('PARTY')['Performance'].sum().sort_values(ascending=False).index
fig.update_xaxes(categoryorder='array', categoryarray=sorted_parties)

# Show the plot
fig.show()


In [None]:
import plotly.express as px

# Filter the DataFrame for Uttar Pradesh, Maharashtra, West Bengal, Bihar, Tamil Nadu, Madhya Pradesh, Andhra Pradesh, and NCT OF Delhi
selected_states = ['Uttar Pradesh', 'Maharashtra', 'West Bengal', 'Bihar', 'Tamil Nadu', 'Madhya Pradesh', 'Andhra Pradesh', 'NCT OF Delhi']
df_selected_states = df[df['STATE'].isin(selected_states)]

# Group by 'PARTY' and 'STATE' and count the number of seats
party_state_performance = df_selected_states.groupby(['PARTY', 'STATE']).size().reset_index(name='Performance')

# Calculate the total performance of each party across all states
party_performance_total = party_state_performance.groupby('PARTY')['Performance'].sum().reset_index(name='Total Performance')

# Select the top 15 parties based on total performance
top_15_parties = party_performance_total.nlargest(20, 'Total Performance')['PARTY']

# Filter the DataFrame to include only the rows corresponding to the top 15 parties
party_state_performance_top15 = party_state_performance[party_state_performance['PARTY'].isin(top_15_parties)]

# Create a bar plot
fig = px.bar(party_state_performance_top15, x='PARTY', y='Performance', color='STATE',
             title='Top 15 Parties: Performance Statewise',
             labels={'Performance': 'Number of Seats', 'PARTY': 'Party'},
             template='plotly_dark')

# Sort the parties by their total performance in descending order
sorted_parties = party_state_performance_top15.groupby('PARTY')['Performance'].sum().sort_values(ascending=False).index
fig.update_xaxes(categoryorder='array', categoryarray=sorted_parties)

# Show the plot
fig.show()


In [None]:
import pandas as pd
import plotly.express as px

# Filter the DataFrame for Uttar Pradesh, Maharashtra, and West Bengal
selected_states = ['Uttar Pradesh', 'Maharashtra', 'West Bengal']
df_selected_states = df[df['STATE'].isin(selected_states)]

# Group by 'PARTY' and 'STATE' and count the number of winners
party_state_counts = df_selected_states[df_selected_states['WINNER'] == 1].groupby(['PARTY', 'STATE']).size().reset_index(name='Counts')

# Select top 15 parties based on the total counts of winners
top_15_parties = party_state_counts.groupby('PARTY')['Counts'].sum().nlargest(15).index

# Filter the DataFrame to include only the top 15 parties
party_state_counts_top15 = party_state_counts[party_state_counts['PARTY'].isin(top_15_parties)]
# Create a bar plot
fig = px.bar(party_state_counts_top15, x='PARTY', y='Counts', color='STATE',
             title='Top 15 Parties: Number of Winners in Uttar Pradesh, Maharashtra, and West Bengal',
             labels={'Counts': 'Number of Winners', 'PARTY': 'Top 15 Parties', 'STATE': 'State'},
             template='plotly_dark')

# Sort the parties by their total performance in descending order
sorted_parties = party_state_counts_top15.groupby('PARTY')['Counts'].sum().sort_values(ascending=False).index
fig.update_xaxes(categoryorder='array', categoryarray=sorted_parties)

# Show the plot
fig.show()


**Observation:** As seen in the above chart, the 2019 elections have been extremely lucky for parties like BJP,SHS or DMK. But it has been a major failure for the rest of the parties, where they have lost more than they won.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Filter rows where 'NAME' column contains 'NOTA'
nota_rows = df_NOTA[df_NOTA['NAME'].str.contains('NOTA', na=False)]

# Group by 'STATE' and sum 'TOTAL\nVOTES'
statewise_votes = nota_rows.groupby('STATE')['TOTAL\nVOTES'].sum().reset_index()

# Sort the data by 'TOTAL\nVOTES' for better visualization
statewise_votes = statewise_votes.sort_values(by='TOTAL\nVOTES', ascending=False)

colors = plt.cm.YlGnBu(np.linspace(0, 1, len(statewise_votes)))
# Plot the results
plt.figure(figsize=(14, 8))
plt.bar(statewise_votes['STATE'], statewise_votes['TOTAL\nVOTES'], color=colors)
plt.xlabel('State')
plt.ylabel('Total NOTA Votes')
plt.title('State-wise NOTA Vote Count')
plt.xticks(rotation=90)
plt.show()


## Politician Level Analytics

### What is the Gender Ratio of the Contestants? Also the Gender Ratio of the Winners?

In [None]:
df_gndr = df
# Calculate overall gender counts
gndr_overall = df_gndr.groupby('GENDER')['NAME'].count().reset_index(name='Counts')
gndr_overall['Category'] = 'Overall Gender Ratio'
# Filter out winners
winners = df_gndr[df_gndr['WINNER'] == 1]
# Calculate gender counts for winners
gndr_winner = winners.groupby('GENDER')['NAME'].count().reset_index(name='Counts')
gndr_winner['Category'] = 'Winning Gender Ratio'
# Concatenate overall and winner gender counts
gndr_overl_win = pd.concat([gndr_winner, gndr_overall])
# Create a grouped bar plot
fig = px.bar(gndr_overl_win, x='GENDER', y='Counts', color='Category', barmode='group')
fig.update_layout(
    title_text='Participation vs Win Counts analysis for the Genders',  # Set the plot title
    template='plotly_dark')
fig.show()


**Observation:** Out of the total list of participants only 12.78% (258 out of 2018) are female politicians, while 87.21% (1760 out of 2018) are male. Upon considering the winners, 14.1% (76 out of 463) are female politicians, while 85.9% are male politicians. The Gender ratio is not very well distributed as can be seen from the above presentation.

### What is the Educational Qualification of our politicians?

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Create subplots
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])

# Calculate overall education qualification counts
ed_cnt = df.groupby('EDUCATION')['PARTY'].count().reset_index(name='Counts')

# Add trace for overall education qualification
fig.add_trace(
    go.Pie(labels=ed_cnt['EDUCATION'], values=ed_cnt['Counts'], pull=[0.1, 0.2, 0, 0.1, 0.2, 0, 0.1, 0.2, 0, 0.1, 0.2, 0.1],
          title='Education Qualification Analysis of all'),
    1, 1)

# Filter out winners
df_win = df[df['WINNER'] == 1]

# Calculate education qualification counts for winners
ed_win_cnt = df_win.groupby('EDUCATION')['PARTY'].count().reset_index(name='Counts')

# Add trace for education qualification of winners
fig.add_trace(
    go.Pie(labels=ed_win_cnt['EDUCATION'], values=ed_win_cnt['Counts'], pull=[0.1, 0.2, 0, 0.1, 0.2, 0, 0.1, 0.1, 0.2, 0, 0.1, 0.2], 
           title='Education Qualification of the Winners'),
    1, 2)

# Update layout
fig.update_layout(
    title_text='Education Qualification Analysis',  # Set the title
    template='plotly_dark',                          # Use the 'plotly_dark' template
    grid=dict(rows=1, columns=2),                     # Set the grid for subplots
    height=500 
)

fig.show()


**Observation:** The total percentage of Graduate+ educated people contesting in the election is 67.12%, which has increased to 72.17% of the winners. This is actually a positive sign, as educated politicians are a very big factor towards a country's development. But still around 28% of the politicians have received no professional degree. Hope with passing time, we improve upon this factor, and consider the educational qualification as a primary requirement while voting!

### What is the relationship of Age and Politics?

In [None]:
# Group by age and gender and count the number of politicians in each group
age_cnt = df.groupby(['AGE', 'GENDER'])['NAME'].count().reset_index(name='Counts')

# Create a histogram with violin plot marginal for age counts distribution among politicians
fig = px.histogram(age_cnt, x="AGE", y='Counts', color='GENDER', marginal='violin', title='Age Counts Distribution among the politicians')

# Update layout
fig.update_layout(
    title_text='Age Counts Distribution among the politicians',  # Set the title
    template='plotly_dark',
    width=800,  # Set the width of the plot
    height=850  # Set the height of the plot
)
fig.show()


**Observation:** Most Number of female politicians have their average age between 45-50, while for male politician, it ranges from 50-60 range. The average age of male politians is more as compared to female politicians contesting for the Lok Sabha elections.

### What relation does the Politician category have with the election results?

In [None]:
df_cat = df
# Calculate overall category counts
cat_overall = df_cat.groupby('CATEGORY')['NAME'].count().reset_index(name='Counts')
cat_overall['Category'] = 'Overall Category Counts'

# Filter out winners
winners = df[df['WINNER'] == 1]

# Calculate category counts for winners
cat_winner = winners.groupby('CATEGORY')['NAME'].count().reset_index(name='Counts')
cat_winner['Category'] = 'Winning Category Ratio'

# Concatenate overall and winner category counts
cat_overl_win = pd.concat([cat_winner, cat_overall])

# Create a grouped bar plot
fig = px.bar(cat_overl_win, x='CATEGORY', y='Counts', color='Category', barmode='group')

# Update layout
fig.update_layout(
    title_text='Participation vs Win Counts for the Category in Politics',  # Set the title
    template='plotly_dark')                                               
fig.show()


**Observation:** The Category participation of General-SC-ST have been in the ratio of 68.97:18.97:12.04- while as of the winners, the ratios have been modified to 74.02:15.76:10:20

### Have the politicians been involved with criminal activities?

In [None]:
# Define bins for criminal cases ranges
bins = [-1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, float('inf')]
labels = ['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100', '100-above']

# defining df_Criminal_range 
df_Criminal_range=df
# Add a new column to the DataFrame indicating the bin for each criminal cases value
df_Criminal_range['Criminal Cases Range'] = pd.cut(df_Criminal_range['CRIMINAL CASES'], bins=bins, labels=labels, right=False)

# Filter the DataFrame for winners and overall categories separately
winners = df_Criminal_range[df_Criminal_range['WINNER'] == 1]
overall = df_Criminal_range

# Group by criminal cases range and count the number of politicians in each group for winners
winners_grouped = winners.groupby('Criminal Cases Range').size().reset_index(name='Winner Counts')

# Group by criminal cases range and count the number of politicians in each group for overall categories
overall_grouped = overall.groupby('Criminal Cases Range').size().reset_index(name='Overall Counts')

# Merge the results for winners and overall categories into a single DataFrame
merged = pd.merge(winners_grouped, overall_grouped, on='Criminal Cases Range')

# Plot the grouped counts
fig = px.bar(merged, x='Criminal Cases Range', y=['Winner Counts', 'Overall Counts'],
             barmode='group', labels={'value': 'Politicians Count', 'variable': 'Category'},
             title='Winner and Overall Category Counts by Criminal Cases Range')
fig.update_layout(template='plotly_dark')
fig.show()


### Age Group wise Criminal Cases

In [None]:

# Define the bins for age groups
bins = [0, 20, 30, 40, 50, 60, 70, float('inf')]
labels = ['Below 20', '20-30', '30-40', '40-50', '50-60', '60-70', '70 above']

# defining df_Age_group is already defined and contains the necessary columns
df_Age_group = df
# Create a new column 'Age Group' with the bins
df_Age_group['Age Group'] = pd.cut(df_Age_group['AGE'], bins=bins, labels=labels, right=False)

# Group by 'Age Group' and count the number of criminals in each group
criminal_counts = df_Age_group[df_Age_group['CRIMINAL CASES'] > 0].groupby('Age Group')['NAME'].count().reset_index(name='Criminal Count')

# Create a bar chart with gradient colors
fig = px.bar(criminal_counts, x='Age Group', y='Criminal Count',
             title='Criminal Count by Age Group',
             labels={'Criminal Count': 'Criminal Count', 'Age Group': 'Age Group'},
             template='plotly_dark',
             color='Criminal Count',  # Use 'Criminal Count' for color mapping
             color_continuous_scale='viridis',  # Specify the color scale
             )

# Show the plot
fig.show()


### Party Wise Count of Criminal Cases 

In [None]:
import pandas as pd
import plotly.express as px

# Group by 'PARTY' and count the number of criminal cases
party_criminal_cases = df[df['CRIMINAL CASES'] > 0].groupby('PARTY')['CRIMINAL CASES'].count().reset_index()
party_criminal_cases.columns = ['PARTY', 'Criminal Cases']

# Select top 15 parties based on the count of criminal cases
top_15_parties = party_criminal_cases.nlargest(15, 'Criminal Cases')

# Create bar plot with color-coded bars
fig = px.bar(top_15_parties, x='PARTY', y='Criminal Cases',
             title='Count of Criminal Cases for Top 15 Parties',
             labels={'Criminal Cases': 'Count of Criminal Cases', 'PARTY': 'Party'},
             template='plotly_dark',
             color='Criminal Cases',  # Color by the count of criminal cases
             color_continuous_scale='viridis',  # Use a sequential color scale
             )

# Show the plot
fig.show()


**Observations:** Many politicians have been associated with criminal activities. Always these cases pressed need not be genuine, but obviously, when its multiple- this is a serious issue. We must take the responsibility while voting, as its our duty to choose the right person- as a duty towards the nation.

### Plotting the Assets vs Liabilities amount for Winning Politicians (Plotted w.r.t State)

In [None]:
import plotly.express as px

# Filter the DataFrame for winning politicians and sort by assets
win_as_liab_name = df[df['WINNER'] == 1].sort_values(by='ASSETS', ascending=False)

# Create the scatter plot using Plotly
fig = px.scatter(win_as_liab_name, x='ASSETS', y='LIABILITIES', 
                 color='STATE', size='ASSETS', 
                 hover_data=['NAME', 'PARTY', 'CONSTITUENCY', 'STATE', 'WINNER'],
                 title='Assets vs Liabilities for the Winning Politicians', template='plotly_dark')
fig.update_layout(height=500)
# Show the Plotly figure
fig.show()


### For top 15 Parties ASSETS And LIABILITIES

In [None]:
import pandas as pd
import plotly.express as px

# Group by 'PARTY' and calculate total assets and liabilities
party_assets_liabilities = df.groupby('PARTY').agg({'ASSETS': 'sum', 'LIABILITIES': 'sum'}).reset_index()

# Select top 15 parties based on total assets
top_15_parties = party_assets_liabilities.nlargest(15, 'ASSETS')

# Melt the DataFrame to prepare for clustered column chart
melted_df = pd.melt(top_15_parties, id_vars='PARTY', var_name='Financials', value_name='Amount')

# Create clustered column chart
fig = px.bar(melted_df, x='PARTY', y='Amount', color='Financials',
             barmode='group', title='Assets and Liabilities for Top 15 Parties',
             labels={'Amount': 'Amount (in Rupees)', 'PARTY': 'Party'}, template='plotly_dark')

# Show the plot
fig.show()


**Observations:** The assets and liabilities of the Winning politicians have been plotted. The parameters vary largely depending on the business/services they are associated with besides politics. No valid correlation could be inferred with respect to assets and liabilities.

# Model Preparing

## DATA manipulation

In [None]:
df.head()

* State: Uttar Pradesh, Maharashtra, West Bengal, 'Bihar', 'Tamil Nadu', 'Madhya Pradesh', 'Andhra Pradesh' top 7 CONSTITUENCY wise
* Party: Top 20 Party
* CONSTITUENCY, NAME, SYMBOL, GENERAL VOTES, POSTAL VOTES, OVER TOTAL VOTES POLLED IN CONSTITUENCY, Criminal Cases Range, Age Group : remove
* Divide all education type: 
  1. High Education: This group will include education types such as 'Post Graduate', 'Doctorate', and 'Graduate Professional'.
  2. 
Medium Education: This group will include education types such as 'Graduate' and '12th Pass'
  3. 
Low Education: This group will include all other education types such as '10th Pass', '8th Pass', '5th Pass', 'Literate', 'Illiterate', 'Others', and 'Not Available'.

In [None]:
# Top 5 state CONSTITUENCY wise
Top7_selected_states = ['Uttar Pradesh', 'Maharashtra', 'West Bengal', 'Bihar', 'Tamil Nadu', 'Madhya Pradesh', 'Andhra Pradesh']
df_7selected_states = df[df['STATE'].isin(Top7_selected_states)]
# Print the column names of the DataFrame
print(df_7selected_states.columns)

In [None]:
# Drop specified columns
columns_to_drop = ['CONSTITUENCY', 'NAME', 'SYMBOL', 'GENERAL VOTES', 
                   'POSTAL VOTES', 'OVER TOTAL VOTES POLLED IN CONSTITUENCY', 
                   'Criminal Cases Range', 'Age Group']
cleaned_df_7selected_states = df_7selected_states.drop(columns=columns_to_drop)
# Print the column names of the DataFrame
print(cleaned_df_7selected_states.columns)

In [None]:
# Function to categorize education types into groups
def categorize_education(education):
    higher_education = ['Post Graduate', 'Doctorate', 'Graduate Professional']
    standard_education = ['Graduate', '12th Pass']
    if education in higher_education:
        return 'Higher Education'
    elif education in standard_education:
        return 'Standard Education'
    else:
        return 'Primary Education'

# Apply the categorize_education function to create a new column 'Education Group'
cleaned_df_7selected_states['Education Group'] = cleaned_df_7selected_states['EDUCATION'].apply(categorize_education)
cleaned_df_7selected_states_EG = cleaned_df_7selected_states.drop(columns= 'EDUCATION')

# Use value_counts to get unique names and their counts in the 'Education Group' column
education_group_counts = cleaned_df_7selected_states_EG['Education Group'].value_counts()

# Print the unique names and their counts
print(education_group_counts)


In [None]:
cleaned_df_7selected_states_EG.info()

In [None]:
# List of top 10 parties
top_20_parties = ['BJP', 'INC', 'VBA', 'AITC', 'SHS' , 'BSP', 'CPI(M)', 'NCP', 'SP', 'IND','NTK','TDP','YSRCP','MNM','DMK','AIADMK','RJD','JnP','JD(U)','SBSP']

# Filter the DataFrame to include only the top 10 parties
candidates_df = cleaned_df_7selected_states_EG[cleaned_df_7selected_states_EG['PARTY'].isin(top_20_parties)]
candidates_df.info()


## EDA

### Dummy Variables

In [None]:
 cat=candidates_df.dtypes=="object"
cat_col=list(cat[cat].index)
print(cat_col)

In [None]:
State_counts = candidates_df['STATE'].value_counts()

# Print the unique names and their counts
print(State_counts)

In [None]:
dummy_candidates_df= pd.get_dummies(candidates_df, columns=['STATE', 'PARTY', 'GENDER', 'CATEGORY', 'Education Group'], drop_first=True, dtype=int)
dummy_candidates_df.head()

### Outlier treatment 

In [None]:
dummy_candidates_df.describe()

In [None]:
plt.figure(figsize=(13,3))
sns.boxplot(data=dummy_candidates_df)
plt.xticks(rotation=90)
plt.show()

In [None]:
import pandas as pd

# Calculate IQR
Q1 = dummy_candidates_df.quantile(0.25)
Q3 = dummy_candidates_df.quantile(0.75)
IQR = Q3 - Q1

# Filter out outliers
#df_no_outliers = dummy_candidates_df[~((dummy_candidates_df < (Q1 - 1.5 * IQR)) |(dummy_candidates_df > (Q3 + 1.5 * IQR))).any(axis=1)]

# Capping outliers for 'ASSETS' and 'LIABILITIES'
dummy_candidates_df['ASSETS'] = dummy_candidates_df['ASSETS'].clip(lower=Q1['ASSETS'] - 1.5 * IQR['ASSETS'], upper=Q3['ASSETS'] + 1.5 * IQR['ASSETS'])
dummy_candidates_df['LIABILITIES'] = dummy_candidates_df['LIABILITIES'].clip(lower=Q1['LIABILITIES'] - 1.5 * IQR['LIABILITIES'], upper=Q3['LIABILITIES'] + 1.5 * IQR['LIABILITIES'])



In [None]:
plt.figure(figsize=(13,3))
sns.boxplot(data=dummy_candidates_df)
plt.xticks(rotation=90)
plt.show()

### **The Correlation Matrix**

In [None]:
corri = dummy_candidates_df.corr()
mask = np.triu(np.ones_like(corri, dtype=bool))
plt.figure(figsize=(25,19))
sns.heatmap(corri, annot=True, mask = mask)
plt.xticks(rotation=45)
plt.show()

#### **Variance Inflation Factor** (VIF) 
it is a measure used to quantify multicollinearity in a set of predictor variables in a regression analysis. It assesses how much the variance of an estimated regression coefficient is inflated due to multicollinearity in the model. High VIF values indicate high multicollinearity, which can cause issues with the interpretation and stability of the regression coefficients.

**Treatment for High VIF:**
Identify High VIF Variables: Look for variables with VIF values greater than a certain threshold, typically 10.

**Address Multicollinearity:** High VIF values suggest that the variable is highly correlated with other predictor variables in the model. To address multicollinearity, consider the following options:

 1. Remove the Variable: If the variable is not essential or redundant with other predictors, removing it can reduce multicollinearity.
 2. Combine Variables: If two or more highly correlated variables are conceptually similar, you can create a composite variable by averaging or summing them.
 3. Keep One Variable: Keep the variable with the most relevance to the research question or domain knowledge and remove the others.


In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Calculate VIF for each feature
vif = pd.DataFrame()
vif["Variable"] = dummy_candidates_df.columns
vif["VIF"] = [variance_inflation_factor(dummy_candidates_df.values, i) for i in range(dummy_candidates_df.shape[1])]

print(vif)
#If you find variables with a very high VIF (typically > 10), you may need to remove or combine them.

In [None]:
# Drop specified columns
columns_to_drop = ['TOTAL VOTES','TOTAL ELECTORS']

# Remove or combine these variables from the dataset
dummy_candidates_df_reduced = dummy_candidates_df.drop(columns=columns_to_drop)

# Reassess VIF values for the remaining variables
vif_reduced = pd.DataFrame()
vif_reduced["Variable"] = dummy_candidates_df_reduced.columns
vif_reduced["VIF"] = [variance_inflation_factor(dummy_candidates_df_reduced.values, i) for i in range(dummy_candidates_df_reduced.shape[1])]

print(vif_reduced)


### **x-y split**

In [None]:
dummy_candidates_df_reduced.shape

In [None]:
x_multi = dummy_candidates_df_reduced.loc[:, dummy_candidates_df_reduced.columns !="WINNER"]
y_multi = dummy_candidates_df_reduced['WINNER']

In [None]:
print(x_multi.shape, y_multi.shape)

### **Test-Train Split**

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_multi, y_multi ,test_size=0.2,random_state=0)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

# Model Building

## Model_1: Multiple Logisctic Regression

### Multiple Losgistic Regression using sklearn library

In [None]:
from sklearn.linear_model import LogisticRegression
clf_lrs_multi = LogisticRegression()
clf_lrs_multi.fit(x_multi,y_multi)

In [None]:
y_pred_lrs = clf_lrs_multi.predict(x_multi)

acc_lrs = accuracy_score(y_multi, y_pred_lrs)
f1_lrs = f1_score(y_multi, y_pred_lrs, average='weighted')
roc_lrs = roc_auc_score(y_multi, y_pred_lrs)

print("Confusion Matrix: ")
print(confusion_matrix(y_multi, clf_lrs_multi.predict(x_multi)))
print('Accuracy: ', acc_lrs,'\nF1 Score: ', f1_lrs, '\nAUC(ROC): ', roc_lrs )
print("Classification Report: ")
print(classification_report(y_multi, y_pred_lrs))

The issue with getting zero values in the confusion matrix for logistic regression suggests that the model is failing to predict one of the classes. This could be due to several reasons, including issues with data preprocessing, class imbalance, or model parameters.
Here are some steps to troubleshoot and rectify the issue:
1. **Check Class Imbalance:** If your dataset is highly imbalanced (i.e., one class is significantly more frequent than the other), the logistic regression model might end up predicting only the majority class. You can use techniques such as oversampling the minority class, undersampling the majority class, or using SMOTE (Synthetic Minority Over-sampling Technique).
2. **Standardize Features:** Ensure that your features are standardized (i.e., they have zero mean and unit variance). Logistic regression can be sensitive to the scale of the features.
3. **Adjust Threshold:** The default decision threshold for logistic regression is 0.5. If your data is imbalanced, adjusting this threshold can help improve the prediction of the minority class.
4. **Regularization:** Ensure that you are using appropriate regularization. Logistic regression models often benefit from regularization (L1 or L2) to avoid overfitting and improve generalization.
5. **Evaluate Model Performance:** Use metrics such as precision, recall, F1-score, and ROC-AUC to better understand model performance, especially in the context of imbalanced datasets.


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Handle class imbalance
smote = SMOTE()
x_res, y_res = smote.fit_resample(x_train, y_train)

# Standardize features
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_res)
x_test_scaled = scaler.transform(x_test)

# Logistic Regression with regularization
log_reg = LogisticRegression(C=0.01, penalty='l2', solver='liblinear')
log_reg.fit(x_train_scaled, y_res)

# Predict probabilities and adjust threshold
y_prob = log_reg.predict_proba(x_train_scaled)[:, 1]
threshold = 0.3
y_train_pred_adjusted = (y_prob >= threshold).astype(int)
acc_train_lrs = accuracy_score(y_train, y_train_pred_rfclf)
print("Training Data Metrics:")
print('Accuracy: ', acc_train_lrs)

# Predict probabilities and adjust threshold for testing data
y_prob = log_reg.predict_proba(x_test_scaled)[:, 1]
threshold = 0.3
y_pred_adjusted = (y_prob >= threshold).astype(int)


# Confusion Matrix and Classification Report
print("\nTesting Data Metrics:")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_adjusted))

acc_lrs_AT1 = accuracy_score(y_test, y_pred_adjusted)
f1_lrs_AT1 = f1_score(y_test, y_pred_adjusted, average='weighted')
roc_lrs_AT1 = roc_auc_score(y_test, y_pred_adjusted)

print('Accuracy: ', acc_lrs_AT1, '\nF1 Score: ', f1_lrs_AT1, '\nAUC(ROC): ', roc_lrs_AT1)
print("Classification Report: \n", classification_report(y_test, y_pred_adjusted))


### Multiple Logistic Regression using Statsmodel.api

In [None]:
import statsmodels.api as sn
import statsmodels.discrete.discrete_model as sm
x_cons_multi = sn.add_constant(x_multi)
logit_multi = sm.Logit(y_multi,x_cons_multi).fit()

In [None]:
logit_multi.summary()

## Model_2: Linear Discriminant Analysis

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
clf_lda = LinearDiscriminantAnalysis()
clf_lda.fit(x_multi, y_multi)

In [None]:
y_pred_lda = clf_lda.predict(x_multi)

acc_lda = accuracy_score(y_multi, y_pred_lda)
f1_lda = f1_score(y_multi, y_pred_lda, average='weighted')
roc_lda = roc_auc_score(y_multi, y_pred_lda)


print("Confusion Matrix:\n",confusion_matrix(y_multi, clf_lda.predict(x_multi)))
print('Accuracy: ', acc_lda,'\nF1 Score: ', f1_lda, '\nAUC(ROC): ', roc_lda)
print("Classification Report: ")
print(classification_report(y_multi, y_pred_lda))

## Model_3: K-Nearest Neighbors

In [None]:
from sklearn import preprocessing
scaler1 = preprocessing.StandardScaler().fit(x_train)
x_train_s= scaler1.transform(x_train)

In [None]:
scaler2 = preprocessing.StandardScaler().fit(x_test)
x_test_s= scaler2.transform(x_test)

### KNN With n_neighbors = 1

In [None]:
from sklearn.neighbors import KNeighborsClassifier
clf_knn_1 = KNeighborsClassifier(n_neighbors=1)
clf_knn_1.fit(x_train_s, y_train)

In [None]:


# Trainng data 
acc_train_knn1 = accuracy_score(y_train, clf_knn_1.predict(x_train_s))
print("Training Data Metrics:")
print('Accuracy: ', acc_train_knn1)

# Testing data 
y_pred_knn1 = clf_knn_1.predict(x_test_s)
acc_knn1 = accuracy_score(y_test, y_pred_knn1)
f1_knn1 = f1_score(y_test, y_pred_knn1, average='weighted')
roc_knn1 = roc_auc_score(y_test, y_pred_knn1)


print("\nTesting Data Metrics:")
print("Confusion Matrix:\n",confusion_matrix(y_test, clf_knn_1.predict(x_test_s)))
print('Accuracy: ', acc_knn1,'\nF1 Score: ', f1_knn1, '\nAUC(ROC): ', roc_knn1)
print("Classification Report: ")
print(classification_report(y_test, y_pred_knn1))

Achieving perfect scores on the training data with a complex model like a Random Forest, especially with a high number of estimators, is a strong indication of overfitting. The model is likely memorizing the training data rather than learning generalizable patterns. To address this, **tuning hyperparameters** (Use techniques like cross-validation and GridSearchCV to find optimal hyperparameters.) to improve generalization

### KNN With n_neighbors = 3

In [None]:
clf_knn_3 = KNeighborsClassifier(n_neighbors=3)
clf_knn_3.fit(x_train_s, y_train)

In [None]:
# Trainng data 
acc_train_knn3 = accuracy_score(y_train, clf_knn_3.predict(x_train_s))
print("Training Data Metrics:")
print('Accuracy: ', acc_train_knn3)

# Testing data 
print("\nTesting Data Metrics:")
print("Confusion Matrix:\n",confusion_matrix(y_test, clf_knn_3.predict(x_test_s)))
y_pred_knn3 = clf_knn_3.predict(x_test_s)

acc_knn3 = accuracy_score(y_test, y_pred_knn3)
f1_knn3 = f1_score(y_test, y_pred_knn3, average='weighted')
roc_knn3 = roc_auc_score(y_test, y_pred_knn3)

print('Accuracy: ', acc_knn3,'\nF1 Score: ', f1_knn3, '\nAUC(ROC): ', roc_knn3)
print("Classification Report: ")
print(classification_report(y_test, y_pred_knn3))

### Finding the Best K using Gridsearch

In [None]:
from sklearn.model_selection import GridSearchCV

#Creating a dictionary of n_neighbors
params = {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20,30,50,75,100],
          'weights': ['uniform', 'distance'],
          'p': [1, 2]}

#creating a object
grid_search_cv = GridSearchCV(KNeighborsClassifier(), params)
grid_search_cv.fit(x_train_s, y_train)

In [None]:
grid_search_cv.best_params_

In [None]:
optimised_KNN = grid_search_cv.best_estimator_

#### Model Performance After Grid Search

In [None]:
# Trainng data 
acc_train_knnop = accuracy_score(y_train, optimised_KNN.predict(x_train_s))
print("Training Data Metrics:")
print('Accuracy: ', acc_train_knnop)

# Testing data 
print("\nTesting Data Metrics:")
print("Confusion Matrix:\n",confusion_matrix(y_test, optimised_KNN.predict(x_test_s)))
y_pred_knnop = optimised_KNN.predict(x_test_s)

acc_knnop = accuracy_score(y_test, y_pred_knnop)
f1_knnop = f1_score(y_test, y_pred_knnop, average='weighted')
roc_knnop = roc_auc_score(y_test, y_pred_knnop)

print('Accuracy: ', acc_knnop,'\nF1 Score: ', f1_knnop, '\nAUC(ROC): ', roc_knnop)
print("Classification Report: ")
print(classification_report(y_test, y_pred_knnop))

## Model_4: DecisionTreeClassifier

In [None]:
from sklearn import tree
clftree = tree.DecisionTreeClassifier(max_depth = 3)
clftree.fit(x_train, y_train)

In [None]:
dot_data = tree.export_graphviz(clftree, out_file=None,feature_names= x_train.columns, filled = True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

### Model Performance

In [None]:
# Trainng data 
acc_train_clftree = accuracy_score(y_train, clftree.predict(x_train))
print("Training Data Metrics:")
print('Accuracy: ', acc_train_clftree)

# Testing data 
print("\nTesting Data Metrics:")
y_pred_clftree = clftree.predict(x_test)

acc_clftree = accuracy_score(y_test, y_pred_clftree)
f1_clftree = f1_score(y_test, y_pred_clftree, average='weighted')
roc_clftree = roc_auc_score(y_test, y_pred_clftree)

print("Confusion Matrix:\n",confusion_matrix(y_test, clftree.predict(x_test)))
print('Accuracy: ', acc_clftree,'\nF1 Score: ', f1_clftree, '\nAUC(ROC): ', roc_clftree)
print("Classification Report: ")
print(classification_report(y_test, y_pred_clftree))

### Controlling Tree growth

In [None]:
clftree2 = tree.DecisionTreeClassifier(min_samples_leaf = 20, max_depth=4)
clftree2.fit(x_train, y_train)
dot_data1 = tree.export_graphviz(clftree2, out_file=None,feature_names= x_train.columns, filled = True)
graph2 = pydotplus.graph_from_dot_data(dot_data1)
Image(graph2.create_png())

#### Model Performance after Controlling the tree growth

In [None]:
# Trainng data 
acc_train_clftree2 = accuracy_score(y_train, clftree2.predict(x_train))
print("Training Data Metrics:")
print('Accuracy: ', acc_train_clftree2)

# Testing data 
print("\nTesting Data Metrics:")
print("Confusion Matrix:\n",confusion_matrix(y_test, clftree2.predict(x_test)))
y_pred_clftree2 = clftree2.predict(x_test)

acc_clftree2 = accuracy_score(y_test, y_pred_clftree2)
f1_clftree2 = f1_score(y_test, y_pred_clftree2, average='weighted')
roc_clftree2 = roc_auc_score(y_test, y_pred_clftree2)

print('Accuracy: ', acc_clftree2,'\nF1 Score: ', f1_clftree2, '\nAUC(ROC): ', roc_clftree2)
print("Classification Report: ")
print(classification_report(y_test, y_pred_clftree2))

### Bagging

In [None]:
clftree3 = tree.DecisionTreeClassifier()
from sklearn.ensemble import BaggingClassifier
bag_clf = BaggingClassifier(base_estimator=clftree3, n_estimators=1000,
                            bootstrap=True, n_jobs=-1,
                            random_state=42)
bag_clf.fit(x_train, y_train)

In [None]:
# Trainng data 
acc_train_bag = accuracy_score(y_train, bag_clf.predict(x_train))
print("Training Data Metrics:")
print('Accuracy: ', acc_train_bag)

# Testing data 
print("\nTesting Data Metrics:")
print("Confusion Matrix:\n",confusion_matrix(y_test, bag_clf.predict(x_test)))
y_pred_bag = bag_clf.predict(x_test)

acc_bag = accuracy_score(y_test, y_pred_bag)
f1_bag = f1_score(y_test, y_pred_bag, average='weighted')
roc_bag = roc_auc_score(y_test, y_pred_bag)

print('Accuracy: ', acc_bag,'\nF1 Score: ', f1_bag, '\nAUC(ROC): ', roc_bag)
print("Classification Report: ")
print(classification_report(y_test, y_pred_bag))

Achieving perfect scores on the training data with a complex model like a Random Forest, especially with a high number of estimators, is a strong indication of overfitting. The model is likely memorizing the training data rather than learning generalizable patterns. To address this, **tuning hyperparameters** (Use techniques like cross-validation and GridSearchCV to find optimal hyperparameters.) to improve generalization

## Model_5: Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=1000, n_jobs=-1 ,random_state=42)
rf_clf.fit(x_train, y_train)

In [None]:
# Trainng data 
acc_train_rfclf = accuracy_score(y_train, rf_clf.predict(x_train))
print("Training Data Metrics:")
print('Accuracy: ', acc_train_rfclf)

# Testing data 
print("\nTesting Data Metrics:")
print("Confusion Matrix:\n",confusion_matrix(y_test,  rf_clf.predict(x_test)))
y_pred_rfclf = rf_clf.predict(x_test)

acc_rfclf = accuracy_score(y_test, y_pred_rfclf)
f1_rfclf = f1_score(y_test, y_pred_rfclf, average='weighted')
roc_rfclf = roc_auc_score(y_test, y_pred_rfclf)

print('Accuracy: ', acc_rfclf,'\nF1 Score: ', f1_rfclf, '\nAUC(ROC): ', roc_rfclf)

### Grid Search
Achieving perfect scores on the training data with a complex model like a Random Forest, especially with a high number of estimators, is a strong indication of overfitting. The model is likely memorizing the training data rather than learning generalizable patterns. To address this, **tuning hyperparameters** (Use techniques like cross-validation and GridSearchCV to find optimal hyperparameters.) to improve generalization

In [None]:
from sklearn.model_selection import GridSearchCV
rf_clf_grid = RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=42)
params_grid = {"max_features" : [4,5,6,7,8,9,10],
              "min_samples_split": [2, 3, 6, 10],}
grid_search = GridSearchCV(rf_clf_grid, params_grid,
                           n_jobs=-1, cv=5, scoring='accuracy')
grid_search.fit(x_train, y_train)
grid_search.best_params_
cvrf_clf = grid_search.best_estimator_

# Trainng data 
acc_train_cvrf_clf = accuracy_score(y_train, cvrf_clf.predict(x_train))
print("Training Data Metrics:")
print('Accuracy: ', acc_train_cvrf_clf)

# Testing data 
print("\nTesting Data Metrics:")
print("Confusion Matrix:\n",confusion_matrix(y_test,  cvrf_clf.predict(x_test)))
y_pred_cvrf_clf = cvrf_clf.predict(x_test)

acc_cvrf_clf = accuracy_score(y_test, y_pred_cvrf_clf)
f1_cvrf_clf = f1_score(y_test, y_pred_cvrf_clf, average='weighted')
roc_cvrf_clf = roc_auc_score(y_test, y_pred_cvrf_clf)

print('Accuracy: ', acc_cvrf_clf,'\nF1 Score: ', f1_cvrf_clf, '\nAUC(ROC): ', roc_cvrf_clf)
print("Classification Report: ")
print(classification_report(y_test, y_pred_cvrf_clf))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [10, 20, 30, None],
    'max_features' : [4,5,7,10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf_clf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)
grid_search.fit(x_train, y_train)

# Best model from grid search
print(grid_search.best_params_)
best_rf_clf = grid_search.best_estimator_

# Training data metrics
acc_train_cvrfclf1 = accuracy_score(y_train, best_rf_clf.predict(x_train))
print("Training Data Metrics:")
print('Accuracy: ', acc_train_cvrfclf1)

# Test data metrics
y_test_pred_cvrfclf = best_rf_clf.predict(x_test)

acc_test_cvrfclf1 = accuracy_score(y_test, y_test_pred_cvrfclf)
f1_test_cvrfclf1 = f1_score(y_test, y_test_pred_cvrfclf, average='weighted')
roc_test_cvrfclf1 = roc_auc_score(y_test, y_test_pred_cvrfclf)
test_class_report = classification_report(y_test, y_test_pred_cvrfclf)

print("\nTest Data Metrics:")
print("Confusion Matrix:\n",confusion_matrix(y_test,   best_rf_clf.predict(x_test)))
print('Accuracy: ', acc_test_rfclf1,'\nF1 Score: ', f1_test_rfclf1,'\AUC(ROC): ', roc_test_rfclf1)
print("Classification Report:\n", test_class_report)


## Model_6: Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gbc_clf = GradientBoostingClassifier(n_estimators=1000,
                                     max_features=6,
                                     min_samples_split=2,
                                     random_state=42)
gbc_clf.fit(x_train, y_train)

In [None]:
# Trainng data 
acc_train_gbc_clf = accuracy_score(y_train, gbc_clf.predict(x_train))
print("Training Data Metrics:")
print('Accuracy: ', acc_train_gbc_clf)

# Testing data 
print("\nTesting Data Metrics:")
print("Confusion Matrix:\n",confusion_matrix(y_test,   gbc_clf.predict(x_test)))
y_pred_gbc_clf = gbc_clf.predict(x_test)

acc_gbc_clf = accuracy_score(y_test, y_pred_gbc_clf)
f1_gbc_clf = f1_score(y_test, y_pred_gbc_clf, average='weighted')
roc_gbc_clf = roc_auc_score(y_test, y_pred_gbc_clf)

print('Accuracy: ', acc_gbc_clf,'\nF1 Score: ', f1_gbc_clf, '\nAUC(ROC): ', roc_gbc_clf)
print("Classification Report: ")
print(classification_report(y_test, y_pred_gbc_clf))

### GB Using Grid Search

In [None]:
gbc_clf_grid = GradientBoostingClassifier(loss='log_loss',
                                          criterion='friedman_mse',
                                          random_state=42)
params_grid_gbc = {"learning_rate": np.arange(0.01,0.11,0.01),
                   "n_estimators" : [500,700,1000],
                   "max_depth": [1,2,3,4,5],}
grid_search_gbc = GridSearchCV(gbc_clf_grid, params_grid_gbc,
                           n_jobs=-1, cv=5, scoring='accuracy')
grid_search_gbc.fit(x_train, y_train)
print('Best Params: ', grid_search_gbc.best_params_)
cvgbc_clf = grid_search_gbc.best_estimator_


# Trainng data 
acc_train_cvgbc_clf = accuracy_score(y_train, cvgbc_clf.predict(x_train))
print("Training Data Metrics:")
print('Accuracy: ', acc_train_cvgbc_clf)

# Testing data 
print("\nTesting Data Metrics:")
print("Confusion Matrix:\n",confusion_matrix(y_test,   cvgbc_clf.predict(x_test)))
y_pred_cvgbc_clf = cvgbc_clf.predict(x_test)

acc_cvgbc_clf = accuracy_score(y_test, y_pred_cvgbc_clf)
f1_cvgbc_clf = f1_score(y_test, y_pred_cvgbc_clf, average='weighted')
roc_cvgbc_clf = roc_auc_score(y_test, y_pred_cvgbc_clf)

print('Accuracy: ', acc_cvgbc_clf,'\nF1 Score: ', f1_cvgbc_clf, '\nAUC(ROC): ', roc_cvgbc_clf)
print("Classification Report: ")
print(classification_report(y_test, y_pred_cvgbc_clf))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
import numpy as np

# Define GradientBoostingClassifier with initial parameters
gbc_clf_grid = GradientBoostingClassifier(loss='log_loss', criterion='friedman_mse', random_state=42)

params_grid_gbc = {
    "learning_rate": np.arange(0.01, 0.05, 0.01),  # Lower learning rate
    "n_estimators": [100, 200, 300],  # Moderate number of estimators
    "max_depth": [1, 2],  # Further reduced depth
    "min_samples_split": [5, 10, 15],  # Higher min_samples_split
    "min_samples_leaf": [4, 6, 8],  # Higher min_samples_leaf
    "subsample": [0.8, 0.9, 1.0],  # Subsample
    "max_features": ['sqrt', 'log2', None]  # Regularization using max_features
}

# Implement GridSearchCV with cross-validation
grid_search_gbc = GridSearchCV(gbc_clf_grid, params_grid_gbc, n_jobs=-1, cv=5, scoring='accuracy')
grid_search_gbc.fit(x_train, y_train)

print('Best Params: ', grid_search_gbc.best_params_)
cvgbc_clf2 = grid_search_gbc.best_estimator_

# Training data metrics
y_train_pred_cvgbc_clf2 = cvgbc_clf2.predict(x_train)
acc_train_cvgbc_clf2 = accuracy_score(y_train, y_train_pred_cvgbc_clf2)
print("Training Data Metrics:")
print('Accuracy: ', acc_train_cvgbc_clf2)

# Test data metrics
y_test_pred_cvgbc_clf2 = cvgbc_clf2.predict(x_test)
acc_test_cvgbc_clf2 = accuracy_score(y_test, y_test_pred_cvgbc_clf2)
f1_test_cvgbc_clf2 = f1_score(y_test, y_test_pred_cvgbc_clf2, average='weighted')
roc_test_cvgbc_clf2 = roc_auc_score(y_test, y_test_pred_cvgbc_clf2)

print("\nTest Data Metrics:")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred_cvgbc_clf2))
print('Accuracy: ', acc_test_cvgbc_clf2, '\nF1 Score: ', f1_test_cvgbc_clf2, '\nAUC(ROC): ', roc_test_cvgbc_clf2)
print("Classification Report:\n", classification_report(y_test, y_test_pred_cvgbc_clf2))


## Model_7: Ada Boost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier(learning_rate =0.02, n_estimators =5000)
ada_clf.fit(x_train, y_train)

In [None]:
# Trainng data 
acc_train_ada_clf = accuracy_score(y_train, ada_clf.predict(x_train))
print("Training Data Metrics:")
print('Accuracy: ', acc_train_ada_clf)

# Testing data 
print("\nTesting Data Metrics:")
print("Confusion Matrix:\n",confusion_matrix(y_test,  ada_clf.predict(x_test)))
y_pred_ada_clf = ada_clf.predict(x_test)

acc_ada_clf = accuracy_score(y_test, y_pred_ada_clf)
f1_ada_clf = f1_score(y_test, y_pred_ada_clf, average='weighted')
roc_ada_clf = roc_auc_score(y_test, y_pred_ada_clf)

print('Accuracy: ', acc_ada_clf,'\nF1 Score: ', f1_ada_clf, '\nAUC(ROC): ', roc_ada_clf)
print("Classification Report: ")
print(classification_report(y_test, y_pred_ada_clf))

### Using grid search

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix

# Define the base estimator
base_estimator = DecisionTreeClassifier(max_depth=2)  # You can experiment with different depths

# Define the AdaBoost model with the new base estimator
ada_clf_grid = AdaBoostClassifier(base_estimator=base_estimator, random_state=42)

# Reduce the number of estimators
params_grid_ada = {
    'n_estimators': [50, 100, 200, 500],
    'learning_rate': [0.001, 0.01, 0.02, 0.05, 0.1]
}

# Perform grid search
grid_search_ada = GridSearchCV(AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2), random_state=42), 
                               params_grid_ada, n_jobs=-1, cv=5, scoring='accuracy')
grid_search_ada.fit(x_train, y_train)

# Get the best model
print('Best Params: ', grid_search_ada.best_params_)
cv_ada_clf = grid_search_ada.best_estimator_

# Training data metrics
acc_train_cv_ada_clf = accuracy_score(y_train, cv_ada_clf.predict(x_train))

print("Training Data Metrics:")
print('Accuracy: ', acc_train_cv_ada_clf)


# Test data metrics
y_test_pred_cv_ada_clf = cv_ada_clf.predict(x_test)
acc_test_cv_ada_clf = accuracy_score(y_test, y_test_pred_cv_ada_clf)
f1_test_cv_ada_clf = f1_score(y_test, y_test_pred_cv_ada_clf, average='weighted')
roc_test_cv_ada_clf = roc_auc_score(y_test, y_test_pred_cv_ada_clf)

print("\nTest Data Metrics:")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred_cv_ada_clf))
print('Accuracy: ', acc_test_cv_ada_clf,'\nF1 Score: ', f1_test_cv_ada_clf,'\nAUC(ROC): ', roc_test_cv_ada_clf)
print("Classification Report:\n", classification_report(y_test, y_test_pred_cv_ada_clf))


## Model_08: XG Boost

In [None]:
import xgboost as xgb
xgb_clf = xgb.XGBClassifier(max_depth=5, n_estimators=10000, learning_rate=0.3,
                            n_jobs=-1, random_state=42)
xgb_clf.fit(x_train, y_train)

In [None]:
# Trainng data 
acc_train_xgb_clf = accuracy_score(y_train, xgb_clf.predict(x_train))
print("Training Data Metrics:")
print('Accuracy: ', acc_train_xgb_clf)

# Testing data 
print("\nTesting Data Metrics:")
print("Confusion Matrix:\n",confusion_matrix(y_test,  xgb_clf.predict(x_test)))
y_pred_xgb_clf = xgb_clf.predict(x_test)

acc_xgb_clf = accuracy_score(y_test, y_pred_xgb_clf)
f1_xgb_clf = f1_score(y_test, y_pred_xgb_clf, average='weighted')
roc_xgb_clf = roc_auc_score(y_test, y_pred_xgb_clf)

print('Accuracy: ', acc_xgb_clf,'\nF1 Score: ', f1_xgb_clf, '\nAUC(ROC): ', roc_xgb_clf)
print("Classification Report: ")
print(classification_report(y_test, y_pred_xgb_clf))

In [None]:
# Define the parameter grid for GridSearch
param_grid = {
    'max_depth': [3, 4, 5],
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.01, 0.1, 0.2],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2]
}

# Initialize the model
xgb_clf = xgb.XGBClassifier(n_jobs=-1, random_state=42)

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)
grid_search.fit(x_train, y_train)

# Best model from GridSearchCV
best_xgb_clf = grid_search.best_estimator_
print('Best Params: ', grid_search.best_params_)

# Train the best model
best_xgb_clf.fit(x_train, y_train)

# Training data metrics
acc_train_cvxgb_clf = accuracy_score(y_train, best_xgb_clf.predict(x_train))
print("Training Data Metrics:")
print('Accuracy: ', acc_train_cvxgb_clf)


# Test data metrics
y_pred_cvxgb_clf = best_xgb_clf.predict(x_test)
acc_cvxgb_clf = accuracy_score(y_test, y_pred_cvxgb_clf)
f1_cvxgb_clf = f1_score(y_test, y_pred_cvxgb_clf, average='weighted')
roc_cvxgb_clf = roc_auc_score(y_test, y_pred_cvxgb_clf)

print("\nTesting Data Metrics:")
print("Confusion Matrix:\n",confusion_matrix(y_test,  best_xgb_clf.predict(x_test)))
print('Accuracy: ', acc_cvxgb_clf,'\nF1 Score: ', f1_cvxgb_clf,'\nAUC(ROC): ', roc_cvxgb_clf)
print("Classification Report:\n",classification_report(y_test, y_pred_xgb_clf))


# Model Comparison

In [None]:
# Sample data (replace these with your actual data)
train_accuracies = [None, acc_train_lrs, None, acc_train_knn1, acc_train_knn3, acc_train_knnop, acc_train_clftree, acc_train_clftree2, acc_train_bag, acc_train_rfclf, acc_train_cvrf_clf, acc_train_cvrfclf1, acc_train_gbc_clf, acc_train_cvgbc_clf, acc_train_cvgbc_clf2, acc_train_ada_clf, acc_train_cv_ada_clf, acc_train_xgb_clf, acc_train_cvxgb_clf]
test_accuracies = [acc_lrs, acc_lrs_AT1, acc_lda, acc_knn1, acc_knn3, acc_knnop, acc_clftree, acc_clftree2, acc_bag, acc_rfclf, acc_cvrf_clf, acc_test_cvrfclf1, acc_gbc_clf, acc_cvgbc_clf, acc_test_cvgbc_clf2, acc_ada_clf, acc_test_cv_ada_clf, acc_xgb_clf, acc_cvxgb_clf]
f_score = [f1_lrs, f1_lrs_AT1, f1_lda, f1_knn1, f1_knn3, f1_knnop, f1_clftree, f1_clftree2, f1_bag, f1_rfclf, f1_cvrf_clf, f1_test_cvrfclf1, f1_gbc_clf, f1_cvgbc_clf, f1_test_cvgbc_clf2, f1_ada_clf, f1_test_cv_ada_clf, f1_xgb_clf, f1_cvxgb_clf]
roc_auc = [roc_lrs, roc_lrs_AT1, roc_lda, roc_knn1, roc_knn3, roc_knnop, roc_clftree, roc_clftree2, roc_bag, roc_rfclf, roc_cvrf_clf, roc_test_cvrfclf1, roc_gbc_clf, roc_cvgbc_clf, roc_test_cvgbc_clf2, roc_ada_clf, roc_test_cv_ada_clf, roc_xgb_clf, roc_cvxgb_clf]
model_types = ['LogisticReg', 'LogisticReg_grid', 'LDA', 'KNN_1', 'KNN_2', 'KNN_Grid', 'DecisionTree', 'DecisionTree_2', 'DecisionTree_Bagging', 'RandomForest', 'RandomForest_Grid1', 'RandomForest_Grid2', 'GradientBoosting_1', 'GradientBoosting_Grid', 'GradientBoosting_Grid2', 'AdaBoost_1', 'AdaBoost_grid', 'XGB', 'XGB_Grid']

# Create DataFrame
final_df = pd.DataFrame({
    "Model Type": model_types,
    "Train_Accuracies": train_accuracies,
    "Test_Accuracies": test_accuracies,
    "F1 Scores": f_score,
    "ROC AUC": roc_auc
})

# Convert None to np.nan for sorting purposes
final_df['Train_Accuracies'] = final_df['Train_Accuracies'].apply(lambda x: float(x) if x is not None else None)

# Create a boolean column to mark if Train_Accuracies is equal to 1
final_df['Train_Acc_Not_1'] = final_df['Train_Accuracies'] != 1.0

# Sort by the boolean column and then by Test_Accuracies in descending order
final_df_s = final_df.sort_values(by=['Train_Acc_Not_1', 'Test_Accuracies'], ascending=[False, False])

# Drop the boolean column as it's no longer needed
final_df_s = final_df_s.drop(columns=['Train_Acc_Not_1'])

# Round the values
final_df_s = final_df_s.round(4)

# Create subplots
fig, ax = plt.subplots(1, 2, figsize=(18, 9))

# Table plot
ax[0].axis('off')
table = ax[0].table(cellText=final_df_s.values, colLabels=final_df_s.columns, cellLoc='center', loc='center', colWidths=[0.27, 0.20, 0.19, 0.15, 0.15])

# Change background color
table.auto_set_font_size(False)
table.set_fontsize(12)
table.scale(1.0, 2.25)

# Set table colors
for (i, j), cell in table.get_celld().items():
    cell.set_edgecolor('black')
    if i == 0:  # Header
        cell.set_text_props(weight='bold', color='white')
        cell.set_facecolor('black')
    else:
        cell.set_facecolor('lightgrey')
        cell.set_text_props(color='black')

# Add title above the table
ax[0].text(0.5, 1.1, 'Sorted Dataframe by Test Accuracy (Descending):', horizontalalignment='center', verticalalignment='center', transform=ax[0].transAxes, fontsize=16, fontweight='bold')

# Bar plot
sns.barplot(x="Test_Accuracies", y="Model Type", data=final_df_s, palette="viridis", ax=ax[1])
ax[1].set_xlim(0.7, 0.99)
ax[1].set_xlabel("Accuracies", fontsize=10, fontweight='bold')
ax[1].set_ylabel("Model Type", fontsize=10, fontweight='bold')
ax[1].set_title("Model Type vs Accuracies", horizontalalignment='center', verticalalignment='center', transform=ax[1].transAxes, fontsize=16, fontweight='bold')

plt.tight_layout()
plt.show()


# Out Put of the Best Model

In [None]:
column_names = dummy_candidates_df_reduced.columns

# print the column names
print(column_names)

In [None]:
dummy_candidates_df_reduced.describe()

In [None]:
import numpy as np

# Assuming the model is stored in rf_classifier_tuned

def input_features():
    feature_names = [
        'CRIMINAL CASES', 'AGE', 'ASSETS', 'LIABILITIES', 'OVER TOTAL ELECTORS IN CONSTITUENCY', 
        'STATE_Bihar', 'STATE_Madhya Pradesh', 'STATE_Maharashtra', 'STATE_Tamil Nadu', 'STATE_Uttar Pradesh', 
        'STATE_West Bengal', 'PARTY_AITC', 'PARTY_BJP', 'PARTY_BSP', 'PARTY_CPI(M)', 'PARTY_DMK', 'PARTY_INC', 
        'PARTY_IND', 'PARTY_JD(U)', 'PARTY_JnP', 'PARTY_MNM', 'PARTY_NCP', 'PARTY_NTK', 'PARTY_RJD', 'PARTY_SBSP', 
        'PARTY_SHS', 'PARTY_SP', 'PARTY_TDP', 'PARTY_VBA', 'PARTY_YSRCP', 'GENDER_MALE', 'CATEGORY_SC', 'CATEGORY_ST', 
        'Education Group_Primary Education', 'Education Group_Standard Education'
    ]
    
    feature_ranges = {
        'CRIMINAL CASES': (0.0, 28.0), 'AGE': (25.0, 86.0), 'ASSETS': (0.0, 227574800.0), 'LIABILITIES': (0.0, 15666280.0), 
        'OVER TOTAL ELECTORS IN CONSTITUENCY': (0.489755, 51.456884), 'STATE_Bihar': (0, 1), 'STATE_Madhya Pradesh': (0, 1), 
        'STATE_Maharashtra': (0, 1), 'STATE_Tamil Nadu': (0, 1), 'STATE_Uttar Pradesh': (0, 1), 'STATE_West Bengal': (0, 1), 
        'PARTY_AITC': (0, 1), 'PARTY_BJP': (0, 1), 'PARTY_BSP': (0, 1), 'PARTY_CPI(M)': (0, 1), 'PARTY_DMK': (0, 1), 
        'PARTY_INC': (0, 1), 'PARTY_IND': (0, 1), 'PARTY_JD(U)': (0, 1), 'PARTY_JnP': (0, 1), 'PARTY_MNM': (0, 1), 
        'PARTY_NCP': (0, 1), 'PARTY_NTK': (0, 1), 'PARTY_RJD': (0, 1), 'PARTY_SBSP': (0, 1), 'PARTY_SHS': (0, 1), 
        'PARTY_SP': (0, 1), 'PARTY_TDP': (0, 1), 'PARTY_VBA': (0, 1), 'PARTY_YSRCP': (0, 1), 'GENDER_MALE': (0, 1), 
        'CATEGORY_SC': (0, 1), 'CATEGORY_ST': (0, 1), 'Education Group_Primary Education': (0, 1), 
        'Education Group_Standard Education': (0, 1)
    }
    
    features = []
    print("Please enter the candidate details:")
    for feature in feature_names:
        min_val, max_val = feature_ranges[feature]
        value = float(input(f"{feature} (min: {min_val}, max: {max_val}): "))
        
        # Ensuring the value is within the range
        if value < min_val or value > max_val:
            print(f"Warning: {feature} value should be between {min_val} and {max_val}.")
        
        features.append(value)
    
    return features

def predict_winner(model, features):
    features_array = np.array([features])  # Convert list to 2D array to match the input shape for prediction
    prediction = model.predict(features_array)
    return prediction[0]

# Example: Assuming the model is already trained and stored in ADA boostig Grid search
features = input_features()
predicted_winner = predict_winner(cv_ada_clf, features)
print(f"The prediction for WINNER is: {'Yes' if predicted_winner == 1 else 'No'}")
