In [36]:
import pandas as pd
# Load the training set; "train.csv" is resolved relative to the current working directory.
data = pd.read_csv(filepath_or_buffer="train.csv")

In [37]:
def convert_to_lakhs(value):
    """Convert an amount such as ``"5 Crore+"`` to a whole number of lakhs.

    Recognised unit markers and their conversion to lakhs:

    * ``Crore+`` -> multiplied by 100
    * ``Lac+``   -> unchanged
    * ``Thou+``  -> divided by 100
    * ``Hund+``  -> divided by 1000

    Values without a unit marker are treated as already being in lakhs.
    Thousands separators (``,``) are ignored everywhere, and whitespace
    between the number and the unit is optional.

    Args:
        value: The raw cell value. It is passed through ``str()`` first, so
            ints and floats are accepted as well as strings.

    Returns:
        int: The amount in lakhs, truncated towards zero. Amounts smaller
        than one lakh (e.g. ``"50 Thou+"``) therefore become ``0``.

    Raises:
        ValueError: If the numeric part cannot be parsed, or the value is
            missing / non-finite (e.g. ``NaN``).
    """
    # (unit marker, multiplier, divisor); checked in this order.
    unit_scales = (
        ("Crore+", 100, 1),
        ("Lac+", 1, 1),
        ("Thou+", 1, 100),
        ("Hund+", 1, 1000),
    )

    text = str(value).strip()
    if text == "0":
        return 0

    for unit, multiplier, divisor in unit_scales:
        if unit in text:
            # Strip the marker instead of unpacking split(), so "5Crore+"
            # and "5  Crore+" parse the same way as "5 Crore+".
            number = float(text.replace(unit, "").replace(",", ""))
            return int(number * multiplier / divisor)

    # No unit marker: the value is taken to be in lakhs already.
    plain = text.replace(",", "")
    try:
        # Try int first so large integer strings keep full precision.
        return int(plain)
    except ValueError:
        pass
    try:
        # Accept decimal text such as "12.0" (e.g. a column parsed as float).
        return int(float(plain))
    except (ValueError, OverflowError):
        raise ValueError(
            f"Cannot interpret {value!r} as an amount in lakhs"
        ) from None

In [38]:
# Normalise both money columns to integer lakhs, then display the frame.
for money_column in ("Total Assets", "Liabilities"):
    data[money_column] = data[money_column].apply(convert_to_lakhs)

data

Unnamed: 0,ID,Candidate,Constituency ∇,Party,Criminal Case,Total Assets,Liabilities,state,Education
0,0,M.K. Mohan,ANNA NAGAR,DMK,4,21100,200,TAMIL NADU,8th Pass
1,1,Khatik Ramesh Prasad,KARERA (SC),BJP,0,100,0,MADHYA PRADESH,12th Pass
2,2,Dr. Mantar Gowda,MADIKERI,INC,0,700,22,KARNATAKA,Post Graduate
3,3,Kundan Kumar,BEGUSARAI,BJP,0,900,24,BIHAR,Post Graduate
4,4,Swapan Majumder,BANGAON DAKSHIN (SC),BJP,2,200,61,WEST BENGAL,8th Pass
...,...,...,...,...,...,...,...,...,...
2054,2054,V. Sasi,CHIRAYINKEEZHU,CPI,1,61,10,KERALA,Graduate Professional
2055,2055,Pushkar Lal Dangi,MAVLI,INC,0,200,8,RAJASTHAN,10th Pass
2056,2056,Dr. Manju Shiwach,MODI NAGAR,BJP,0,1300,85,UTTAR PRADESH,Graduate
2057,2057,Mansing Fattesingrao Naik,SHIRALA,NCP,1,2500,94,MAHARASHTRA,12th Pass


In [39]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

# Encode the target: each distinct Education string becomes an integer code.
# `label_encoder` is kept so the codes can be mapped back to labels later.
label_encoder = LabelEncoder()

encoded_column = label_encoder.fit_transform(data['Education'])

data['Education'] = encoded_column

categorical_columns = ["Candidate", "Constituency ∇", "Party", "state"]

# One-hot encode the categorical columns (sparse output).
encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(data[categorical_columns])

# Compress the one-hot matrix to 4 dense components. The seed is fixed
# because, for inputs this wide, PCA's automatic solver choice may be the
# randomized one, which would otherwise vary between runs.
pca = PCA(n_components=4, random_state=42)
embedding = pca.fit_transform(encoded_data.toarray())

# NOTE: the column names are kept for compatibility, but each PCA component
# mixes all four categorical columns; it does not belong to the column it is
# named after. The name list has 4 entries only because there are 4
# categorical columns, which happens to equal n_components.
embedding_df = pd.DataFrame(
    embedding,
    columns=[f"{column}_embedding_{i+1}" for i, column in enumerate(categorical_columns)],
    index=data.index,  # align on data's own index so concat cannot misalign rows
)

# Append the embedding and drop the raw categorical columns it replaces.
data_with_embedding = pd.concat([data, embedding_df], axis=1).drop(columns=categorical_columns)

data_with_embedding

Unnamed: 0,ID,Criminal Case,Total Assets,Liabilities,Education,Candidate_embedding_1,Constituency ∇_embedding_2,Party_embedding_3,state_embedding_4
0,0,4,21100,200,3,0.406400,0.310449,-0.476431,-0.256180
1,1,0,100,0,1,-0.644126,-0.206810,0.192647,-0.250256
2,2,0,700,22,9,0.778389,-0.750674,0.299950,0.205060
3,3,0,900,24,9,-0.578171,-0.040020,0.008690,-0.239450
4,4,2,200,61,3,-0.528794,0.375374,0.624886,0.021810
...,...,...,...,...,...,...,...,...,...
2054,2054,1,61,10,6,0.318661,0.171625,-0.178761,-0.065696
2055,2055,0,200,8,0,0.674872,-0.621277,0.237316,0.100766
2056,2056,0,1300,85,5,-0.766007,-0.052041,-0.252275,0.687853
2057,2057,1,2500,94,1,0.311347,0.148433,-0.221261,-0.263265


In [40]:
# Target: the Education column of the embedded frame.
y = data_with_embedding["Education"]

# Features: every remaining column except the row identifier, Liabilities and
# the target. A new frame is built instead of dropping in place, so this cell
# can be re-run and `data_with_embedding` keeps all of its columns.
X = data_with_embedding.drop(columns=["ID", "Liabilities", "Education"])

print(X)
print(y)

      Criminal Case  Total Assets  Candidate_embedding_1  \
0                 4         21100               0.406400   
1                 0           100              -0.644126   
2                 0           700               0.778389   
3                 0           900              -0.578171   
4                 2           200              -0.528794   
...             ...           ...                    ...   
2054              1            61               0.318661   
2055              0           200               0.674872   
2056              0          1300              -0.766007   
2057              1          2500               0.311347   
2058              0            11              -0.596745   

      Constituency ∇_embedding_2  Party_embedding_3  state_embedding_4  
0                       0.310449          -0.476431          -0.256180  
1                      -0.206810           0.192647          -0.250256  
2                      -0.750674           0.299950         

In [41]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X1, X_test, y1, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [42]:
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

# Learn the per-column means on the training split only, then apply those
# same means to the test split. Refitting on the test split would fill its
# gaps with test-set statistics.
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X1)
X_test_imputed = imputer.transform(X_test)

# Fixed seed so the fitted tree, and therefore the reported score, is repeatable.
clf = DecisionTreeClassifier(random_state=42)

clf.fit(X_imputed, y1)

y_pred = clf.predict(X_test_imputed)

# Weighted F1: per-class F1 averaged with weights proportional to class support.
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score:", f1)


F1 Score: 0.1760757532746867
