In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, roc_auc_score, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings("ignore")

In [2]:
def evaluate_model(model, x_train, y_train, x_test, y_test):
    model = model.fit(x_train, y_train)
    predict_train_y = model.predict(x_train)
    predict_test_y = model.predict(x_test)

    print("**Accuracy Score**")
    train_accuracy = accuracy_score(y_train, predict_train_y)
    test_accuracy = accuracy_score(y_test, predict_test_y)
    print("Train Accuracy is: %s" % (train_accuracy))
    print("\nTest Accuracy is: %s" % (test_accuracy))
    print("---------------------------------------------------------")

    print("\n**Accuracy Error**")
    train_error = (1 - train_accuracy)
    test_error = (1 - test_accuracy)
    print("Train Error: %s" % (train_error))
    print("\nTest Error: %s" % (test_error))
    print("---------------------------------------------------------")

    print("\n**Classification Report**")
    train_cf_report = pd.DataFrame(classification_report(y_train, predict_train_y, output_dict=True))
    test_cf_report = pd.DataFrame(classification_report(y_test, predict_test_y, output_dict=True))
    print("Train Classification Report:")
    print(train_cf_report)
    print("\n Test Classification Report:")
    print(test_cf_report)
    print("---------------------------------------------------------")

    print("\n**Confusion Matrix**")
    train_conf = confusion_matrix(y_train, predict_train_y)
    test_conf = confusion_matrix(y_test, predict_test_y)
    print("Train Confusion Matrix Report:")
    print((train_conf))
    print("\n Test Confusion Matrix Report:")
    print((test_conf))

In [3]:
def FeatureSelection(model):
    model = CatBoostClassifier()
    _ = model.fit(X_train_std, y_train)
    print(model.score(X_test_std, y_test))
    featr = pd.DataFrame(
        zip(X_train.columns, abs(model.feature_importances_)),
        columns=["feature", "weight"],
    ).sort_values("weight")

    high = featr[featr["weight"] < 10].index
    low = featr.drop(high)
    return low["feature"]


In [4]:
# Reading the Dataset
df=pd.read_csv('/kaggle/input/dsicsv/Data_Science_Internship - Dump.csv')

# Dropping features which are not needed/ should not be used(lost_reason)
df.drop(['Agent_id','lead_id','lost_reason'], axis=1,inplace=True)

# Dropping rows which are fully null if any
df.dropna(how="all")

# Handling Missing Data
# Since Feature Selection will eliminate low weight features imputation of data using mode is an acceptable decision
df["budget"].fillna(df["budget"].mode(),inplace=True)
df["lease"].fillna(df["lease"].mode(),inplace=True)
df["room_type"].fillna(df["room_type"].mode(),inplace=True)
df["movein"]=df["movein"].fillna('00-00-0000') #Does not effect the model due to Feature Selection
df

Unnamed: 0.1,Unnamed: 0,status,budget,lease,movein,source,source_city,source_country,utm_source,utm_medium,des_city,des_country,room_type
0,0,LOST,£60 - £120 Per week,Full Year Course Stay 40 - 44 weeks,00-00-0000,9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6...,9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6...,9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6...,3d59f7548e1af2151b64135003ce63c0a484c26b9b8b16...,268ad70eb5bc4737a2ae28162cbca30118cc94520e49ef...,ecc0e7dc084f141b29479058967d0bc07dee25d9690a98...,8d23a6e37e0a6431a8f1b43a91026dcff51170a89a6512...,Ensuite
1,1,LOST,,,00-00-0000,9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6...,9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6...,9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6...,3d59f7548e1af2151b64135003ce63c0a484c26b9b8b16...,268ad70eb5bc4737a2ae28162cbca30118cc94520e49ef...,5372372f3bf5896820cb2819300c3e681820d82c6efc54...,8d23a6e37e0a6431a8f1b43a91026dcff51170a89a6512...,
2,2,LOST,£121 - £180 Per Week,Full Year Course Stay 40 - 44 weeks,31/08/22,7aae3e886e89fc1187a5c47d6cea1c22998ee610ade1f2...,9b8cc3c63cdf447e463c11544924bf027945cbd29675f7...,e09e10e67812e9d236ad900e5d46b4308fc62f5d69446a...,bbdefa2950f49882f295b1285d4fa9dec45fc4144bfb07...,09076eb7665d1fb9389c7c4517fee0b00e43092eb34821...,11ab03a1a8c367191355c152f39fe28cae5e426fce49ef...,8d23a6e37e0a6431a8f1b43a91026dcff51170a89a6512...,Ensuite
3,3,LOST,0-0,0,00-00-0000,ba2d0a29556ac20f86f45e4543c0825428cba33fd7a9ea...,a5f0d2d08eb0592087e3a3a2f9c1ba2c67cc30f2efd2bd...,e09e10e67812e9d236ad900e5d46b4308fc62f5d69446a...,bbdefa2950f49882f295b1285d4fa9dec45fc4144bfb07...,09076eb7665d1fb9389c7c4517fee0b00e43092eb34821...,19372fa44c57a01c37a5a8418779ca3d99b0b59731fb35...,8d23a6e37e0a6431a8f1b43a91026dcff51170a89a6512...,
4,4,LOST,,,00-00-0000,9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6...,9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6...,9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6...,3d59f7548e1af2151b64135003ce63c0a484c26b9b8b16...,268ad70eb5bc4737a2ae28162cbca30118cc94520e49ef...,9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6...,9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
46603,46603,LOST,£60 - £120 Per week,Complete Education Year Stay 50 - 52 weeks,01/09/22,9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6...,7d1550b844ff586a6023216c06263105eed0a849a2a1f6...,8da82000ef9c4468ba47362a924b895e40662fed846942...,bbdefa2950f49882f295b1285d4fa9dec45fc4144bfb07...,09076eb7665d1fb9389c7c4517fee0b00e43092eb34821...,13d4b96b6ee361b737e22404aeecca45f322ba7d4c2c42...,8d23a6e37e0a6431a8f1b43a91026dcff51170a89a6512...,Studio
46604,46604,LOST,£60 - £120 Per week,Summer/Short Stay 8 - 12 weeks,29/09/22,9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6...,9f2ba6981e36ea0fca6c16f42e6413d788d2a7499b8b7f...,8da82000ef9c4468ba47362a924b895e40662fed846942...,bbdefa2950f49882f295b1285d4fa9dec45fc4144bfb07...,09076eb7665d1fb9389c7c4517fee0b00e43092eb34821...,998c18dc0e513ab0035c7b858e3780bbbead88695a15de...,8d23a6e37e0a6431a8f1b43a91026dcff51170a89a6512...,Studio
46605,46605,LOST,£241 - £300 Per week,Full Year Course Stay 40 - 44 weeks,20/09/22,7aae3e886e89fc1187a5c47d6cea1c22998ee610ade1f2...,7d1550b844ff586a6023216c06263105eed0a849a2a1f6...,8da82000ef9c4468ba47362a924b895e40662fed846942...,bbdefa2950f49882f295b1285d4fa9dec45fc4144bfb07...,09076eb7665d1fb9389c7c4517fee0b00e43092eb34821...,ecc0e7dc084f141b29479058967d0bc07dee25d9690a98...,8d23a6e37e0a6431a8f1b43a91026dcff51170a89a6512...,Studio
46606,46606,LOST,1108,294,30/08/22,d684761c17c11590f6e2525b48141cb2c0c6f2be5df4e2...,e9c722cbefc2f055ae60b4e2cbe73a2d99537eab0c37f3...,b936ee09e20b3b2234907cde349cda1c1a5327c4a486bf...,7f3fa48ca885678134842fa7456f3ece53a97f843b6101...,abb8e2badd5b6265c3237170cc599257a4f566706715d2...,ecc0e7dc084f141b29479058967d0bc07dee25d9690a98...,8d23a6e37e0a6431a8f1b43a91026dcff51170a89a6512...,


In [5]:
# Dropping rows which have status other than LOST OR WON
clean_df=pd.DataFrame()
clean_df=df[df.status!='OPPORTUNITY']
clean_df=clean_df[clean_df.status!='OPPORTUNITY']
clean_df=clean_df[clean_df.status!='CONTACTED']
clean_df=clean_df[clean_df.status!='PROCESSING']
clean_df=clean_df[clean_df.status!='IMPORTANT']
clean_df

Unnamed: 0.1,Unnamed: 0,status,budget,lease,movein,source,source_city,source_country,utm_source,utm_medium,des_city,des_country,room_type
0,0,LOST,£60 - £120 Per week,Full Year Course Stay 40 - 44 weeks,00-00-0000,9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6...,9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6...,9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6...,3d59f7548e1af2151b64135003ce63c0a484c26b9b8b16...,268ad70eb5bc4737a2ae28162cbca30118cc94520e49ef...,ecc0e7dc084f141b29479058967d0bc07dee25d9690a98...,8d23a6e37e0a6431a8f1b43a91026dcff51170a89a6512...,Ensuite
1,1,LOST,,,00-00-0000,9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6...,9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6...,9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6...,3d59f7548e1af2151b64135003ce63c0a484c26b9b8b16...,268ad70eb5bc4737a2ae28162cbca30118cc94520e49ef...,5372372f3bf5896820cb2819300c3e681820d82c6efc54...,8d23a6e37e0a6431a8f1b43a91026dcff51170a89a6512...,
2,2,LOST,£121 - £180 Per Week,Full Year Course Stay 40 - 44 weeks,31/08/22,7aae3e886e89fc1187a5c47d6cea1c22998ee610ade1f2...,9b8cc3c63cdf447e463c11544924bf027945cbd29675f7...,e09e10e67812e9d236ad900e5d46b4308fc62f5d69446a...,bbdefa2950f49882f295b1285d4fa9dec45fc4144bfb07...,09076eb7665d1fb9389c7c4517fee0b00e43092eb34821...,11ab03a1a8c367191355c152f39fe28cae5e426fce49ef...,8d23a6e37e0a6431a8f1b43a91026dcff51170a89a6512...,Ensuite
3,3,LOST,0-0,0,00-00-0000,ba2d0a29556ac20f86f45e4543c0825428cba33fd7a9ea...,a5f0d2d08eb0592087e3a3a2f9c1ba2c67cc30f2efd2bd...,e09e10e67812e9d236ad900e5d46b4308fc62f5d69446a...,bbdefa2950f49882f295b1285d4fa9dec45fc4144bfb07...,09076eb7665d1fb9389c7c4517fee0b00e43092eb34821...,19372fa44c57a01c37a5a8418779ca3d99b0b59731fb35...,8d23a6e37e0a6431a8f1b43a91026dcff51170a89a6512...,
4,4,LOST,,,00-00-0000,9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6...,9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6...,9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6...,3d59f7548e1af2151b64135003ce63c0a484c26b9b8b16...,268ad70eb5bc4737a2ae28162cbca30118cc94520e49ef...,9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6...,9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
46603,46603,LOST,£60 - £120 Per week,Complete Education Year Stay 50 - 52 weeks,01/09/22,9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6...,7d1550b844ff586a6023216c06263105eed0a849a2a1f6...,8da82000ef9c4468ba47362a924b895e40662fed846942...,bbdefa2950f49882f295b1285d4fa9dec45fc4144bfb07...,09076eb7665d1fb9389c7c4517fee0b00e43092eb34821...,13d4b96b6ee361b737e22404aeecca45f322ba7d4c2c42...,8d23a6e37e0a6431a8f1b43a91026dcff51170a89a6512...,Studio
46604,46604,LOST,£60 - £120 Per week,Summer/Short Stay 8 - 12 weeks,29/09/22,9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6...,9f2ba6981e36ea0fca6c16f42e6413d788d2a7499b8b7f...,8da82000ef9c4468ba47362a924b895e40662fed846942...,bbdefa2950f49882f295b1285d4fa9dec45fc4144bfb07...,09076eb7665d1fb9389c7c4517fee0b00e43092eb34821...,998c18dc0e513ab0035c7b858e3780bbbead88695a15de...,8d23a6e37e0a6431a8f1b43a91026dcff51170a89a6512...,Studio
46605,46605,LOST,£241 - £300 Per week,Full Year Course Stay 40 - 44 weeks,20/09/22,7aae3e886e89fc1187a5c47d6cea1c22998ee610ade1f2...,7d1550b844ff586a6023216c06263105eed0a849a2a1f6...,8da82000ef9c4468ba47362a924b895e40662fed846942...,bbdefa2950f49882f295b1285d4fa9dec45fc4144bfb07...,09076eb7665d1fb9389c7c4517fee0b00e43092eb34821...,ecc0e7dc084f141b29479058967d0bc07dee25d9690a98...,8d23a6e37e0a6431a8f1b43a91026dcff51170a89a6512...,Studio
46606,46606,LOST,1108,294,30/08/22,d684761c17c11590f6e2525b48141cb2c0c6f2be5df4e2...,e9c722cbefc2f055ae60b4e2cbe73a2d99537eab0c37f3...,b936ee09e20b3b2234907cde349cda1c1a5327c4a486bf...,7f3fa48ca885678134842fa7456f3ece53a97f843b6101...,abb8e2badd5b6265c3237170cc599257a4f566706715d2...,ecc0e7dc084f141b29479058967d0bc07dee25d9690a98...,8d23a6e37e0a6431a8f1b43a91026dcff51170a89a6512...,


In [6]:
# Since the domain of dataset is of student accomodation, rather than specific dates
# month will be a more important feature
clean_df['movein']=clean_df['movein'].str.split('-', expand=True)[1]


# Label encoding the target variable
label_encoder = LabelEncoder()
clean_df["status"] = label_encoder.fit_transform(clean_df["status"])

# One-hot-encoding all the rest of the features
encoded_data = pd.get_dummies(clean_df, columns = ['budget','lease','room_type','source','source_city','source_country','utm_source','utm_medium','des_city','des_country','movein'])
encoded_data


Unnamed: 0.1,Unnamed: 0,status,budget_0-,budget_0-$100,budget_0-$100 to $150 per week,budget_0-$100/week,budget_0-$20,budget_0-$250,budget_0-$400-500CAD,budget_0-$50,...,des_country_7a1ca4ef7515f7276bae7230545829c27810c9d9e98ab2c06066bee6270d5153,des_country_80db4ccdca106d37b920206331fcfe3e9e50a9e763d89b54ce3ad5ac8cf30f03,des_country_8d23a6e37e0a6431a8f1b43a91026dcff51170a89a6512ff098eaa56a4d5fb19,des_country_8ef41e6f4b07432a0cb4eb7a8774e7a3878fd3e385f49aa09b406768467db228,des_country_9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6c9bc9d493a23be9de0,des_country_abd149214539d9f222d25de6358735b9fa0efd3956f66102b2c119ae2d9f6348,des_country_be55ef3f4c4e6c2d9c2afe2a33ac90ad0f50d4de7f9163999877e2a9ca5a54f8,des_country_c1ef40ce0484c698eb4bd27fe56c1e7b68d74f9780ed674210d0e5013dae45e9,des_country_dd64913c8b91d76ca27b371f0b86fd30452370469b8cadf4e5a61d41d14e42ec,movein_00
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,2,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46603,46603,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
46604,46604,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
46605,46605,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
46606,46606,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [7]:
# Creating target and feature dataframes
features=encoded_data.drop("status",axis=1)
features.drop("Unnamed: 0", axis = 1, inplace = True)
target=encoded_data["status"]

# Feature, target arrays
X, y = features, target

In [8]:
# Since the given data is imbalanced (has majority Lost cases)
# Oversampling is required to accurately represent the sample
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Train/test set generation using stratified sammpling
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42)

X_train_std = StandardScaler().fit_transform(X_train)
X_test_std = StandardScaler().fit_transform(X_test)

# Selecting features whose weight is above 10
Remove=FeatureSelection(CatBoostClassifier())
X_train.loc[:, Remove]

Learning rate set to 0.062897
0:	learn: 0.6794555	total: 137ms	remaining: 2m 17s
1:	learn: 0.6679955	total: 198ms	remaining: 1m 39s
2:	learn: 0.6576293	total: 256ms	remaining: 1m 25s
3:	learn: 0.6483452	total: 313ms	remaining: 1m 17s
4:	learn: 0.6401562	total: 370ms	remaining: 1m 13s
5:	learn: 0.6332859	total: 427ms	remaining: 1m 10s
6:	learn: 0.6252870	total: 485ms	remaining: 1m 8s
7:	learn: 0.6191364	total: 545ms	remaining: 1m 7s
8:	learn: 0.6123669	total: 605ms	remaining: 1m 6s
9:	learn: 0.6068846	total: 661ms	remaining: 1m 5s
10:	learn: 0.6005297	total: 724ms	remaining: 1m 5s
11:	learn: 0.5929506	total: 779ms	remaining: 1m 4s
12:	learn: 0.5870652	total: 837ms	remaining: 1m 3s
13:	learn: 0.5831412	total: 897ms	remaining: 1m 3s
14:	learn: 0.5793973	total: 954ms	remaining: 1m 2s
15:	learn: 0.5742563	total: 1.01s	remaining: 1m 2s
16:	learn: 0.5685929	total: 1.07s	remaining: 1m 1s
17:	learn: 0.5632437	total: 1.13s	remaining: 1m 1s
18:	learn: 0.5588126	total: 1.19s	remaining: 1m 1s
19:	l

10296
16486
70455
74137
34278
...
56166
52139
51422
37252
1574


In [9]:
# After testing accuracies of several models CatBoostClassifier was found to be most apt for this model
CB=CatBoostClassifier(random_state=42,silent=True)# It gives us an accuracy of 0.940628974447913

# Evaluating accuracy, precision, recall and f1 scores
evaluate_model(CB,X_train,y_train,X_test,y_test)


**Accuracy Score**
Train Accuracy is: 0.9494435612082671

Test Accuracy is: 0.9435772921725055
---------------------------------------------------------

**Accuracy Error**
Train Error: 0.05055643879173288

Test Error: 0.05642270782749448
---------------------------------------------------------

**Classification Report**
Train Classification Report:
                      0             1  accuracy     macro avg  weighted avg
precision      0.923492      0.978784  0.949444      0.951138      0.951138
recall         0.980084      0.918803  0.949444      0.949444      0.949444
f1-score       0.950947      0.947846  0.949444      0.949396      0.949396
support    34595.000000  34595.000000  0.949444  69190.000000  69190.000000

 Test Classification Report:
                     0            1  accuracy     macro avg  weighted avg
precision     0.914264     0.977355  0.943577      0.945809      0.945809
recall        0.978957     0.908197  0.943577      0.943577      0.943577
f1-score      0

In [10]:
# Generating probability values of each lead converting
lead_scores =CB.predict_proba(X_test)

# Converting probabilty values into lead scores
X_test['lead_score'] = lead_scores[:, 1]
X_test['lead_score'] = X_test['lead_score'].apply(lambda x: x * 100)
X_test['status']=y_test

# Generating an Excel Sheet of test data with lead scores
X_test.to_csv("Test-results.csv")

# Printing out the average of lead scores of won cases in test set
X_test = X_test.drop(X_test[X_test['status'] == 0].index)
print(X_test["lead_score"].mean()) #Output is 84.88666006697574

84.88666006697574
