In [1]:
#importing required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

In [2]:
df= pd.read_csv('course_lead_scoring.csv')

In [3]:
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


#### Data preparation

Check if the missing values are presented in the features.


In [6]:
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [7]:
categorical= ['lead_source','industry','employment_status','location']
numerical=['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

If there are missing values:
For caterogiral features, replace them with 'NA'
For numerical features, replace with with 0.0

In [9]:
df[categorical]= df[categorical].fillna('NA')
df[numerical]= df[numerical].fillna(0.0)

In [10]:
df.isna().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

##### Question 1: What is the most frequent observation (mode) for the column industry?

In [12]:
df.industry.mode()

0    retail
Name: industry, dtype: object

Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

In [14]:
correlation_matrix= df[numerical].corr()
correlation_matrix

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,1.0,0.009888
lead_score,-0.004879,0.01561,0.009888,1.0


##### Question 2: What are the two features that have the biggest correlation?

In [16]:
#Answer: annual_income and interaction_count

#### Split the data

In [18]:
df_full_train, df_test= train_test_split(df, test_size=0.2, random_state= 42)

In [19]:
df_train, df_val= train_test_split(df_full_train, test_size=0.25, random_state= 42)

In [20]:
df_train= df_train.reset_index(drop=True)
df_test= df_test.reset_index(drop=True)
df_val= df_val.reset_index(drop=True)

In [21]:
y_train= df_train.converted.values
y_test= df_test.converted.values
y_val= df_val.converted.values

In [22]:
del df_train['converted']
del df_test['converted']
del df_val['converted']

In [23]:
#Calculate the mutual information score between y and other categorical variables in the dataset. 
#Use the training set only.

In [24]:
def mutual_info_converted_score(series):
    return mutual_info_score(series, df_full_train.converted)

In [25]:
mi= df_full_train[categorical].apply(mutual_info_converted_score)
mi.sort_values(ascending=False)

lead_source          0.025665
employment_status    0.013258
industry             0.011685
location             0.002253
dtype: float64

##### Question 3: Which of these variables has the biggest mutual information score?

In [27]:
#Answer lead_source

In [28]:
dicts_train= df_train[categorical + numerical].to_dict(orient='records')
dicts_val= df_val[categorical + numerical].to_dict(orient='records')

In [29]:
dv= DictVectorizer(sparse= False)
X_train= dv.fit_transform(dicts_train)
X_val= dv.fit_transform(dicts_val)

In [30]:
model= LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [31]:
model.fit(X_train, y_train)

In [32]:
y_pred= model.predict_proba(X_val)[:,1]

In [33]:
churn_decision= (y_pred >= 0.5)

In [34]:
df_pred= pd.DataFrame()
df_pred['probability']= y_pred
df_pred['prediction']= churn_decision.astype(int)
df_pred['actual']= y_val
df_pred['correct']= df_pred.prediction == df_pred.actual
df_pred

Unnamed: 0,probability,prediction,actual,correct
0,0.611922,1,0,False
1,0.799826,1,1,True
2,0.530213,1,0,False
3,0.471315,0,0,True
4,0.570661,1,0,False
...,...,...,...,...
288,0.419342,0,0,True
289,0.710539,1,1,True
290,0.418185,0,0,True
291,0.744835,1,1,True


In [35]:
round(df_pred.correct.mean(),2)

0.7

In [36]:
y_predf= model.predict(X_val)

In [37]:
accuracy= accuracy_score(y_predf, y_val)
print(accuracy)

0.6996587030716723


In [38]:
categorical 

['lead_source', 'industry', 'employment_status', 'location']

In [39]:
numerical

['number_of_courses_viewed',
 'annual_income',
 'interaction_count',
 'lead_score']

In [40]:
#Without industry
dicts_train= df_train[['lead_source', 'employment_status', 'location'] + numerical].to_dict(orient='records')
dicts_val= df_val[['lead_source', 'employment_status', 'location'] + numerical].to_dict(orient='records')
dv= DictVectorizer(sparse= False)
X_train= dv.fit_transform(dicts_train)
X_val= dv.fit_transform(dicts_val)
model.fit(X_train, y_train)
y_predf= model.predict(X_val)
accuracy2= accuracy_score(y_predf, y_val)
print(accuracy2)

0.6996587030716723


In [41]:
#Without 'employment_status'
dicts_train= df_train[['industry', 'lead_source', 'location'] + numerical].to_dict(orient='records')
dicts_val= df_val[['industry', 'lead_source', 'location'] + numerical].to_dict(orient='records')
dv= DictVectorizer(sparse= False)
X_train= dv.fit_transform(dicts_train)
X_val= dv.fit_transform(dicts_val)
model.fit(X_train, y_train)
y_predf= model.predict(X_val)
accuracy3= accuracy_score(y_predf, y_val)
print(accuracy3)

0.6962457337883959


In [42]:
#Without lead_score
dicts_train= df_train[categorical+ ['number_of_courses_viewed','annual_income','interaction_count']].to_dict(orient='records')
dicts_val= df_val[categorical+ ['number_of_courses_viewed','annual_income','interaction_count']].to_dict(orient='records')
dv= DictVectorizer(sparse= False)
X_train= dv.fit_transform(dicts_train)
X_val= dv.fit_transform(dicts_val)
model.fit(X_train, y_train)
y_predf= model.predict(X_val)
accuracy4= accuracy_score(y_predf, y_val)
print(accuracy4)

0.7064846416382252


In [43]:
print(accuracy-accuracy2, accuracy-accuracy3, accuracy-accuracy4)

0.0 0.0034129692832763903 -0.0068259385665528916


##### Question 5 Which of following feature has the smallest difference?

In [45]:
#Answer: lead_score

In [96]:
models = {}
accuracies = {}
for c in [0.01, 0.1, 1, 10, 100]:
    
    model= LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    models[c] = model
    y_predv= model.predict(X_val)
    score= accuracy_score(y_predv, y_val)
    accuracies[c] = score
print(c, accuracies)
    
    
    
    

100 {0.01: 0.6996587030716723, 0.1: 0.6996587030716723, 1: 0.6996587030716723, 10: 0.6996587030716723, 100: 0.6996587030716723}


##### Which of these C leads to the best accuracy on the validation set?

In [101]:
#0.01