In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Loading the dataset
df = pd.read_excel('dump.xlsx')

In [7]:
# Removing leads with STATUS other than WON or LOST
df = df[(df['status'] == 'WON') | (df['status'] == 'LOST')]

# Removing the 'lead_id' and 'Agent_id' columns
df = df.drop(['lead_id', 'Agent_id'], axis=1)

# Converting all columns to categorical columns
df = df.astype('category')

In [8]:
# Replacing all occurrences of '9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6c9bc9d493a23be9de0' with NaN
df = df.replace('9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6c9bc9d493a23be9de0', pd.NaT)


In [16]:
# Extracting numerical values from 'budget' and 'lease' columns
df['budget'] = df['budget'].str.extract('(\d+)', expand=False)
df['lease'] = df['lease'].str.extract('(\d+)', expand=False)

df.head()

Unnamed: 0.1,Unnamed: 0,status,lost_reason,budget,lease,movein,source,source_city,source_country,utm_source,utm_medium,des_city,des_country,room_type
0,0.0,LOST,Not responding,,,NaT,,,,3d59f7548e1af2151b64135003ce63c0a484c26b9b8b16...,268ad70eb5bc4737a2ae28162cbca30118cc94520e49ef...,ecc0e7dc084f141b29479058967d0bc07dee25d9690a98...,8d23a6e37e0a6431a8f1b43a91026dcff51170a89a6512...,
1,1.0,LOST,Low budget,,,NaT,,,,3d59f7548e1af2151b64135003ce63c0a484c26b9b8b16...,268ad70eb5bc4737a2ae28162cbca30118cc94520e49ef...,5372372f3bf5896820cb2819300c3e681820d82c6efc54...,8d23a6e37e0a6431a8f1b43a91026dcff51170a89a6512...,
2,2.0,LOST,Not responding,121.0,40.0,2022-08-31,7aae3e886e89fc1187a5c47d6cea1c22998ee610ade1f2...,9b8cc3c63cdf447e463c11544924bf027945cbd29675f7...,e09e10e67812e9d236ad900e5d46b4308fc62f5d69446a...,bbdefa2950f49882f295b1285d4fa9dec45fc4144bfb07...,09076eb7665d1fb9389c7c4517fee0b00e43092eb34821...,11ab03a1a8c367191355c152f39fe28cae5e426fce49ef...,8d23a6e37e0a6431a8f1b43a91026dcff51170a89a6512...,Ensuite
3,3.0,LOST,Low budget,0.0,,NaT,ba2d0a29556ac20f86f45e4543c0825428cba33fd7a9ea...,a5f0d2d08eb0592087e3a3a2f9c1ba2c67cc30f2efd2bd...,e09e10e67812e9d236ad900e5d46b4308fc62f5d69446a...,bbdefa2950f49882f295b1285d4fa9dec45fc4144bfb07...,09076eb7665d1fb9389c7c4517fee0b00e43092eb34821...,19372fa44c57a01c37a5a8418779ca3d99b0b59731fb35...,8d23a6e37e0a6431a8f1b43a91026dcff51170a89a6512...,
4,4.0,LOST,Junk lead,,,NaT,,,,3d59f7548e1af2151b64135003ce63c0a484c26b9b8b16...,268ad70eb5bc4737a2ae28162cbca30118cc94520e49ef...,,,


In [25]:
from sklearn.preprocessing import LabelEncoder
# Treat all columns as categorical
df = df.astype(str)

# Encode the categorical features
encoder = LabelEncoder()
for col in df.columns:
    df[col] = encoder.fit_transform(df[col])


In [26]:
# Split the dataset into training and testing sets
X = df.drop(['status'], axis=1)
y = df['status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
# Training the machine learning model
model = RandomForestClassifier()
model.fit(X_train, y_train)

In [32]:
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='macro')
rec = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

print('Accuracy:', acc)
print('Precision:', prec)
print('Recall:', rec)
print('F1-score:', f1)

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-score: 1.0
