In [110]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score, accuracy_score, mean_squared_error



In [121]:
# Source CSV for the course lead-scoring dataset (fetched over HTTPS).
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'

# Load the raw data into a DataFrame.
df = pd.read_csv(data)

# Preview the first rows transposed, so every column is visible as a row.
df.head().transpose()


Unnamed: 0,0,1,2,3,4
lead_source,paid_ads,social_media,events,paid_ads,referral
industry,,retail,healthcare,retail,education
number_of_courses_viewed,1,1,5,2,3
annual_income,79450.0,46992.0,78796.0,83843.0,85012.0
employment_status,unemployed,employed,unemployed,,self_employed
location,south_america,south_america,australia,australia,europe
interaction_count,4,1,3,1,3
lead_score,0.94,0.8,0.69,0.87,0.62
converted,1,0,1,0,1


In [130]:
# Categorical feature columns (string-valued).
categoricas = [
    'lead_source',
    'industry',
    'employment_status',
    'location',
]

# Numerical feature columns.
numericas = [
    'number_of_courses_viewed',
    'annual_income',
    'interaction_count',
    'lead_score',
]

In [131]:


# Numeric features: coerce every column to a numeric dtype (values that cannot
# be parsed become NaN), then replace all missing values with 0.0.
df[numericas] = df[numericas].apply(pd.to_numeric, errors='coerce').fillna(0.0)

# Categorical features: replace missing values with the literal label 'NA'.
df[categoricas] = df[categoricas].fillna('NA')
    




In [132]:
###Q1
# Frequency of each industry value, most common first.
df['industry'].value_counts()


industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

In [133]:
###Q2
# Pairwise correlation matrix of the numeric features.
# Call .corr() here so the name holds the matrix itself rather than the bound
# method (which would have to be re-invoked on every use).
matrix_corr_num = df[numericas].corr()

# Flatten to (feature, feature) pairs and list them from the highest to the
# lowest correlation. The diagonal (1.0) and both orderings of each pair are
# included in the listing.
matrix_corr_num.unstack().sort_values(ascending=False)


number_of_courses_viewed  number_of_courses_viewed    1.000000
annual_income             annual_income               1.000000
lead_score                lead_score                  1.000000
interaction_count         interaction_count           1.000000
annual_income             interaction_count           0.027036
interaction_count         annual_income               0.027036
lead_score                annual_income               0.015610
annual_income             lead_score                  0.015610
lead_score                interaction_count           0.009888
interaction_count         lead_score                  0.009888
annual_income             number_of_courses_viewed    0.009770
number_of_courses_viewed  annual_income               0.009770
lead_score                number_of_courses_viewed   -0.004879
number_of_courses_viewed  lead_score                 -0.004879
                          interaction_count          -0.023565
interaction_count         number_of_courses_viewed   -0.023565

In [134]:
# Split df into train / validation / test sets with scikit-learn.
# The first split holds out 20% of the rows for test; the second takes 25% of
# the remaining 80% (i.e. 20% of the whole) for validation, leaving 60% to train.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# reset_index returns a new frame, so the result must be assigned back;
# calling it without assignment leaves the shuffled original index in place.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_full_train = df_full_train.reset_index(drop=True)

# Separate the target from the features: pop returns the 'converted' column
# and removes it from the frame in one step.
y_train = df_train.pop('converted')
y_val = df_val.pop('converted')
y_test = df_test.pop('converted')
y_full_train = df_full_train.pop('converted')

In [128]:
###Q3
def info_mutua_y(series):
    """Return the mutual information score between `y_train` and `series`.

    Reads the notebook-level `y_train`, so `series` must have one value per
    training row, in the same row order.
    """
    return mutual_info_score(y_train, series)


# Score every categorical feature against the training target.
im_raw = df_train[categoricas].apply(info_mutua_y)

# Rounded scores, kept under the original name in column order.
im = im_raw.round(2)

# Rank on the unrounded scores so that features whose values collapse to the
# same 2-decimal number are still ordered correctly; round only for display.
im_raw.sort_values(ascending=False).round(2)

lead_source          0.04
industry             0.01
employment_status    0.01
location             0.00
dtype: float64

In [135]:
###Q4
# Build the feature matrices: one dict per row, one-hot encoded by
# DictVectorizer (fitted on the training set only, then applied to validation).
dict_train = df_train[categoricas + numericas].to_dict(orient='records')
dict_val = df_val[categoricas + numericas].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(dict_train)
X_val = dv.transform(dict_val)

# Baseline model: logistic regression on all features.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

# accuracy_score takes (y_true, y_pred); use that order, as the later cells do.
# score_orig keeps the unrounded value for later comparisons.
score_orig = accuracy_score(y_val, y_pred)
Accuracy = np.round(score_orig, 2)
Accuracy


np.float64(0.7)

In [119]:
###Q5
# Leave-one-feature-out comparison: retrain the same model once per feature
# with that feature removed, and compare validation accuracy with the
# baseline stored in `score_orig`.
features = df_train.columns.to_list()

# Collect one record per removed feature and build the frame once at the end,
# instead of growing an untyped DataFrame row by row.
ablation_rows = []
for feature in features:
    subset = [col for col in features if col != feature]

    dict_train = df_train[subset].to_dict(orient='records')
    dict_val = df_val[subset].to_dict(orient='records')

    # A fresh vectorizer per subset, fitted on the training rows only.
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(dict_train)
    X_val = dv.transform(dict_val)

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    score = accuracy_score(y_val, y_pred)

    # diferencia > 0 means accuracy dropped when the feature was removed.
    ablation_rows.append({
        'col_eliminada': feature,
        'accuracy': score,
        'diferencia': score_orig - score,
    })

# NOTE(review): the recorded run shows accuracy rising from ~0.70 to ~0.85
# when annual_income is removed; presumably the unscaled income column hurts
# the liblinear fit. Confirm, e.g. by scaling the numeric features.
scores = pd.DataFrame(ablation_rows, columns=['col_eliminada', 'accuracy', 'diferencia'])
scores

Unnamed: 0,col_eliminada,accuracy,diferencia
0,lead_source,0.703072,-0.003413
1,industry,0.699659,0.0
2,number_of_courses_viewed,0.556314,0.143345
3,annual_income,0.853242,-0.153584
4,employment_status,0.696246,0.003413
5,location,0.709898,-0.010239
6,interaction_count,0.556314,0.143345
7,lead_score,0.706485,-0.006826


In [120]:
###Q6
# Rebuild the full-feature matrices: the feature-elimination cell above leaves
# X_train / X_val holding a reduced feature subset.
dict_train = df_train[categoricas + numericas].to_dict(orient='records')
dict_val = df_val[categoricas + numericas].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(dict_train)
X_val = dv.transform(dict_val)

# Values of the regularization parameter C to compare.
C = [0.01, 0.1, 1, 10, 100]
for c in C:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    # Loop-local names: the baseline `score_orig` / `Accuracy` from Q4 are left
    # untouched, so re-running the Q5 comparison after this cell still uses the
    # correct baseline.
    score_c = accuracy_score(y_val, y_pred)
    accuracy_c = np.round(score_c, 3)
    print(f'C = {c}:\t Score = {score_c}\t Accuracy = {accuracy_c}')

# NOTE(review): the recorded run prints the same accuracy for every C;
# presumably the unscaled numeric features dominate the fit. Confirm.


C = 0.01:	 Score = 0.6996587030716723	 Accuracy = 0.7
C = 0.1:	 Score = 0.6996587030716723	 Accuracy = 0.7
C = 1:	 Score = 0.6996587030716723	 Accuracy = 0.7
C = 10:	 Score = 0.6996587030716723	 Accuracy = 0.7
C = 100:	 Score = 0.6996587030716723	 Accuracy = 0.7
