## `Importing and displaying datasets`

### <i>All imports goes here!</i>

In [16]:
import pandas as pd
from IPython.display import display
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import numpy as np
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()


### <i>Reading the data</i>

In [2]:
result_2008 = pd.read_csv('Data/Cleaned_Data/cleaned_&_final_2008_result.csv')
result_2013 = pd.read_csv('Data/Cleaned_Data/cleaned_&_final_2013_result.csv')
result_2017 = pd.read_csv('Data/Cleaned_Data/cleaned_&_final_2017_result.csv')
result_2022 = pd.read_csv('Data/Cleaned_Data/cleaned_&_final_2022_result.csv')

display(result_2008.head())
display(result_2013.head())
display(result_2017.head())
display(result_2022.head())

Unnamed: 0.1,Unnamed: 0,District,Const,WParty,WVotes,Age,TurnOut%
0,0,Achham,Achham 1,uml,2119.0358,40,54.53
1,1,Achham,Achham 2,maoist,1708.727,32,52.82
2,2,Arghakhanchi,Arghakhanchi 1,maoist,1966.7836,46,55.34
3,3,Arghakhanchi,Arghakhanchi 2,nepalicongress,1755.126,46,52.05
4,4,Baglung,Baglung 1,maoist,1828.5047,48,60.01


Unnamed: 0.1,Unnamed: 0,District,Const,WParty,WVotes,Age,TurnOut%
0,0,Taplejung,Taplejung-1,Nepal Communist Party (UML),7434.0,37.0,73.12
1,1,Taplejung,Taplejung-2,Nepal Communist Party (UML),7034.0,48.0,72.6
2,2,Panchthar,Panchthar-1,Nepal Communist Party (UML),13082.0,26.0,72.18
3,3,Panchthar,Panchthar-2,Nepali Congress,11839.0,41.0,74.58
4,4,Illam,Illam-1,Nepal Communist Party (UML),17342.0,55.0,78.58


Unnamed: 0.1,Unnamed: 0,District,Const,WParty,WVotes,Age,TurnOut%
0,0,Taplejung,Taplejung-1,Nepal Communist Party (UML),15417,46,59.060916
1,1,Taplejung,Taplejung-1,Nepali Congress,10974,51,62.215124
2,2,Taplejung,Taplejung-1,Federal Democratic National Front,861,57,54.605809
3,3,Taplejung,Taplejung-1,Naya Shakti Party Nepal,267,41,52.116411
4,4,Taplejung,Taplejung-1,Federal Socialist Forum Nepal,158,45,67.28869


Unnamed: 0.1,Unnamed: 0,District,Const,WParty,WVotes,Age,TurnOut%
0,0,Taplejung,Taplejung-1,Nepal Communist Party (UML),21943,56,49.859126
1,1,Taplejung,Taplejung-1,Nepal Communist Party (Maoist Center) (Unified...,21735,46,53.744678
2,2,Taplejung,Taplejung-1,Federal Democratic National Front,941,56,54.895159
3,3,Taplejung,Taplejung-1,"Janata Samajwadi Party, Nepal",628,49,51.052482
4,4,Taplejung,Taplejung-1,Rastriya Prajatantra Party,406,65,54.328004


## `Combining all datasets`

In [3]:
file_paths = [
    "Data/Cleaned_Data/cleaned_&_final_2008_result.csv",
    "Data/Cleaned_Data/cleaned_&_final_2013_result.csv",
    "Data/Cleaned_Data/cleaned_&_final_2017_result.csv",
    "Data/Cleaned_Data/cleaned_&_final_2022_result.csv",
]

# Combining all datasets into one DataFrame
dataframes = [pd.read_csv(file) for file in file_paths]
combined_data = pd.concat(dataframes, ignore_index=True)

# Displaying the shape of the combined dataset
print("Combined Dataset Shape:", combined_data.shape)


Combined Dataset Shape: (6165, 7)


## `Data Cleaning (if necessary)`

### <i>Checking if there are any missing values</i>

In [4]:
missing_values = combined_data.isnull().sum()
print("Missing Values:\n", missing_values)


Missing Values:
 Unnamed: 0     0
District       0
Const          0
WParty        47
WVotes        47
Age            0
TurnOut%       0
dtype: int64


### <i>Handling missing values</i>

In [5]:
# Using info to check the data type of missing value columns i.e., WParty and WVotes 
combined_data.info()
# Data type found to be oject and float64 for WParty and WVotes respectively

# For WParty which is of object datatype
for column in combined_data['WParty']:
    
    combined_data['WParty'].fillna(combined_data['WParty'].mode()[0], inplace=True)
    
# For WVoteswhich is of float datatype
for column in combined_data['WVotes']:
       
    combined_data['WVotes'].fillna(combined_data['WVotes'].median(), inplace=True)

# Checking if there are still missing values that persist
missing_values = combined_data.isnull().sum()
print("Missing Values:\n", missing_values)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6165 entries, 0 to 6164
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  6165 non-null   int64  
 1   District    6165 non-null   object 
 2   Const       6165 non-null   object 
 3   WParty      6118 non-null   object 
 4   WVotes      6118 non-null   float64
 5   Age         6165 non-null   float64
 6   TurnOut%    6165 non-null   float64
dtypes: float64(3), int64(1), object(3)
memory usage: 337.3+ KB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_data['WParty'].fillna(combined_data['WParty'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_data['WVotes'].fillna(combined_data['WVotes'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never wo

Missing Values:
 Unnamed: 0    0
District      0
Const         0
WParty        0
WVotes        0
Age           0
TurnOut%      0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_data['WVotes'].fillna(combined_data['WVotes'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_data['WVotes'].fillna(combined_data['WVotes'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never wor

### <i>Checking if there are any duplicate values</i>

In [6]:
duplicates = combined_data.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

Number of duplicate rows: 0


### <i>Checking if there are any inconsistencies in data</i>

In [7]:
# Standardizing case and remove leading/trailing spaces
combined_data['District'] = combined_data['District'].str.lower().str.strip()

## `Feature engineering`

### <i>Encoding categorical Variables</i>

#### <i><u>Performing target variable encoding for WParty</u></i>

In [8]:
label_encoder = LabelEncoder()
combined_data['WParty_encoded'] = label_encoder.fit_transform(combined_data['WParty'])

combined_data.head()

Unnamed: 0.1,Unnamed: 0,District,Const,WParty,WVotes,Age,TurnOut%,WParty_encoded
0,0,achham,Achham 1,uml,2119.0358,40.0,54.53,97
1,1,achham,Achham 2,maoist,1708.727,32.0,52.82,95
2,2,arghakhanchi,Arghakhanchi 1,maoist,1966.7836,46.0,55.34,95
3,3,arghakhanchi,Arghakhanchi 2,nepalicongress,1755.126,46.0,52.05,96
4,4,baglung,Baglung 1,maoist,1828.5047,48.0,60.01,95


#### <i><u>Performing one-hot-encoding for District and Const</u></i>

In [9]:
combined_data = pd.get_dummies(combined_data, columns=['District'], drop_first=True)
combined_data = pd.get_dummies(combined_data, columns=['Const'], drop_first=True)

combined_data.to_csv('Data/temp_combined_data.csv')
combined_data.tail()

Unnamed: 0.1,Unnamed: 0,WParty,WVotes,Age,TurnOut%,WParty_encoded,District_arghakhanchi,District_baglung,District_baitadi,District_bajhang,...,Const_सिन्धुपाल्चोक-1,Const_सिन्धुपाल्चोक-2,Const_सिन्धुली-1,Const_सिन्धुली-2,Const_सुर्खेत-1,Const_सुर्खेत-2,Const_सोलुखुम्बु-1,Const_सोलुखुम्बु-2,Const_हुम्ला-1,Const_हुम्ला-2
6160,2442,Nepal Communist Party (UML),12961.0,50.0,56.83,47,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6161,2443,Independent,3567.0,33.0,56.83,18,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6162,2444,Rastriya Prajatantra Party,796.0,55.0,56.83,76,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6163,2445,Independent,214.0,26.0,56.83,18,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6164,2446,Mongol National Organization,208.0,45.0,56.83,35,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## `Model Implementation: Logistic Regression`

#### <i>`Step 1:` Splitting the data into features and target</i>

In [10]:
# Features (excluding the target column)
X = combined_data.drop(columns=['WParty', 'WParty_encoded'])

# Target variable
y = combined_data['WParty_encoded']

#### <i>`Step 2:` Splitting the data into training and testing sets</i>

In [11]:
# Splitting the data (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#### <i>`Step 2.1:` Addressing the ValueError of y only having 1 member.</i>

In [12]:
# Filtering out classes with fewer than 2 instances
class_counts = y.value_counts()
valid_classes = class_counts[class_counts >= 2].index
filtered_data = combined_data[combined_data['WParty_encoded'].isin(valid_classes)]

# Updating features and target
X = filtered_data.drop(columns=['WParty', 'WParty_encoded'])
y = filtered_data['WParty_encoded']

#### <i>`Step 3:` Initializing and training the logistic regression model</i>

In [19]:
dtc.fit(X_train,y_train)
# combined_data.head()

#### <i>`Step 4:` Making predictions</i>

In [20]:
y_pred = dtc.predict(X_test)

#### <i>`Step 5:` Evaluating the model</i>

In [21]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         3
           3       0.00      0.00      0.00         3
           4       0.17      0.21      0.19        24
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         3
           7       0.06      0.05      0.05        20
           8       0.00      0.00      0.00         1
           9       0.08      0.09      0.09        33
          10       0.14      0.13      0.13        31
          11       0.00      0.00      0.00         3
          12       0.00      0.00      0.00         5
          13       0.04      0.06      0.04        33
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         1
          16       0.11      0.08      0.10        24
          17       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [22]:
# Checking the importance of features

features = pd.DataFrame(dtc.feature_importances_, index=X.columns)
features.head(15)

Unnamed: 0,0
Unnamed: 0,0.160744
WVotes,0.296754
Age,0.127645
TurnOut%,0.146849
District_arghakhanchi,0.001083
District_baglung,0.0
District_baitadi,0.000989
District_bajhang,0.000422
District_bajura,0.0
District_banke,0.0


In [47]:
# Solving overfitting of data

dtc2 = DecisionTreeClassifier(criterion='entropy',ccp_alpha=0.008)

dtc2.fit(X_train,y_train)
y_pred2 = dtc2.predict(X_test)

In [48]:
# Checking the classification report again

print(classification_report(y_pred2,y_test))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.07      0.10      0.08        20
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.03      0.06      0.04        17
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         0
          16       0.00      0.00      0.00         0
          17       0.00      0.00      0.00         0
          18       0.89      0.48      0.62      1015
          19       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
