In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# Load training data
train_data = pd.read_csv("train.csv")

# Handle missing values if any
train_data.fillna(0, inplace=True)  # Filling missing values with 0

# Define a function to convert string representations of assets and liabilities to numeric format
def convert_to_numeric(s):
    if 'Crore' in s:
        return float(s.replace(' Crore+', ''))
    elif 'Lac' in s:
        return float(s.replace(' Lac+', '')) / 100
    elif 'Thou' in s:
        return float(s.replace(' Thou+', '')) / 10000
    elif 'Hund' in s:
        return float(s.replace(' Hund+', '')) / 100000
    else:
        return float(s)

# Apply the function to the 'Total Assets' and 'Liabilities' columns
train_data['Total Assets'] = train_data['Total Assets'].apply(convert_to_numeric)
train_data['Liabilities'] = train_data['Liabilities'].apply(convert_to_numeric)


# Encoding categorical variables
encoder = OneHotEncoder()
X_train = train_data[['Criminal Case', 'Total Assets', 'Liabilities']]
y_train = train_data['Education']

# Splitting the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Initializing and training the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Convert feature names to strings
X_train.columns = X_train.columns.astype(str)

rf_classifier.fit(X_train, y_train)

# Convert column names to strings
X_val.columns = X_val.columns.astype(str)

# Evaluating the model
y_pred = rf_classifier.predict(X_val)
print(classification_report(y_val, y_pred))

# Predicting on test data
test_data = pd.read_csv("test.csv")
test_data.fillna(0, inplace=True)
X_test = test_data[['Criminal Case', 'Total Assets', 'Liabilities']]

# Apply the conversion function to the 'Total Assets' and 'Liabilities' columns in the test data
X_test['Total Assets'] = X_test['Total Assets'].apply(convert_to_numeric)
X_test['Liabilities'] = X_test['Liabilities'].apply(convert_to_numeric)

# Predicting on test data
test_predictions = rf_classifier.predict(X_test)

# print(test_data)

predictions_df = pd.DataFrame({'ID': test_data['ID'], 'education level': test_predictions})

predictions_df.to_csv('predictions2.csv', index=False)



                       precision    recall  f1-score   support

            10th Pass       0.22      0.16      0.19        49
            12th Pass       0.08      0.07      0.08        67
             5th Pass       0.00      0.00      0.00         1
             8th Pass       0.12      0.10      0.11        20
            Doctorate       0.00      0.00      0.00        14
             Graduate       0.29      0.33      0.31       108
Graduate Professional       0.19      0.17      0.18        66
             Literate       0.00      0.00      0.00         3
               Others       0.00      0.00      0.00         4
        Post Graduate       0.23      0.21      0.22        80

             accuracy                           0.19       412
            macro avg       0.11      0.11      0.11       412
         weighted avg       0.19      0.19      0.19       412



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['Total Assets'] = X_test['Total Assets'].apply(convert_to_numeric)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['Liabilities'] = X_test['Liabilities'].apply(convert_to_numeric)
