In [1]:
# Importing the Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier


In [2]:
# Loading the Dataset
df = pd.read_csv('lead.csv')

#### Data Cleaning and Feature Selection


In [3]:
df = df[df.status.isin(['WON', 'LOST'])]
df = df.replace('9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6c9bc9d493a23be9de0', np.nan)
df = df.drop(columns=["Unnamed: 0", "Agent_id", "movein","lease", "source", "source_city", "source_country", "utm_source","utm_medium","des_city","des_country","room_type","lead_id"])
df = df.fillna(df.mode().iloc[0])


#### Encoding Categorical Variables


In [4]:
le = LabelEncoder()
for col in df.columns:
    df[col] = le.fit_transform(df[col])


#### Splitting the data into training and testing sets


In [5]:
X = df.drop('status', axis=1)
y = df['status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


#### Training the Model


In [6]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

#### Prediction for Test Data

In [7]:
y_pred = model.predict(X_test)

# Evaluating the Performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy: ",accuracy)
print("Precision: ",precision)
print("Recall: ",recall)
print("F1-Score:",f1)

Accuracy:  0.9412061024755325
Precision:  0.6228571428571429
Recall:  0.24141749723145073
F1-Score: 0.3479648842777334


#### Prediction for complete data

In [8]:
y_pred = model.predict(X)

# Evaluating the Performance
accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)

print("Accuracy: ",accuracy)
print("Precision: ",precision)
print("Recall: ",recall)
print("F1-Score:",f1)

Accuracy:  0.9454843793855388
Precision:  0.732597623089983
Recall:  0.28083306215424664
F1-Score: 0.4060221124441307


#### Hyperparameter Tuning:
If the performance of the model is not satisfactory, we can perform hyperparameter tuning by using Grid Search or Randomized Search to find the optimal hyperparameters.

In [9]:

from sklearn.model_selection import GridSearchCV

# Define the grid search parameters
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [2, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and the corresponding score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)


Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
Best Score: 0.9390210049042288


In [10]:
# Train the model on the best parameters obtained from the grid search
best_model = RandomForestClassifier(n_estimators=grid_search.best_params_['n_estimators'], 
                                     max_depth=grid_search.best_params_['max_depth'], 
                                     min_samples_split=grid_search.best_params_['min_samples_split'], 
                                     min_samples_leaf=grid_search.best_params_['min_samples_leaf'], 
                                     random_state=42)

best_model.fit(X_train, y_train)

In [11]:
# Testing the Model
y_pred = best_model.predict(X_test)

# Evaluating the Performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy: ",accuracy)
print("Precision: ",precision)
print("Recall: ",recall)
print("F1-Score:",f1)

Accuracy:  0.9412061024755325
Precision:  0.7443181818181818
Recall:  0.1450719822812846
F1-Score: 0.24281742354031507
