In [1]:
import pandas as pd

In [2]:
# Load the lead-scoring dataset from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="course_lead_scoring.csv")

In [3]:
# Show the leading rows and then the frame's shape, each on its own line.
print(df.head(), df.shape, sep="\n")

    lead_source    industry  number_of_courses_viewed  annual_income  \
0      paid_ads         NaN                         1        79450.0   
1  social_media      retail                         1        46992.0   
2        events  healthcare                         5        78796.0   
3      paid_ads      retail                         2        83843.0   
4      referral   education                         3        85012.0   

  employment_status       location  interaction_count  lead_score  converted  
0        unemployed  south_america                  4        0.94          1  
1          employed  south_america                  1        0.80          0  
2        unemployed      australia                  3        0.69          1  
3               NaN      australia                  1        0.87          0  
4     self_employed         europe                  3        0.62          1  
(1462, 9)


In [4]:
# Show the last five rows of the frame.
print(df.tail(n=5))

         lead_source       industry  number_of_courses_viewed  annual_income  \
1457        referral  manufacturing                         1            NaN   
1458        referral     technology                         3        65259.0   
1459        paid_ads     technology                         1        45688.0   
1460        referral            NaN                         5        71016.0   
1461  organic_search        finance                         3        92855.0   

     employment_status       location  interaction_count  lead_score  \
1457     self_employed  north_america                  4        0.53   
1458           student         europe                  2        0.24   
1459           student  north_america                  3        0.02   
1460     self_employed  north_america                  0        0.25   
1461           student  north_america                  3        0.41   

      converted  
1457          1  
1458          1  
1459          1  
1460          

In [5]:
# Count the null entries in every column of the frame.
missing_summary = df.isnull().sum()

# sep="" keeps print() from inserting its default separator space right after
# the heading's trailing "\n"; that space would otherwise shift the first row
# of the table one column to the right of the remaining rows.
print("Missing values in each column:\n", missing_summary, sep="")

Missing values in each column:
 lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64


In [6]:
# Text-valued columns that contain gaps
categorical_cols = ['lead_source', 'industry', 'employment_status', 'location']

# Numeric columns that contain gaps
numerical_cols = ['annual_income']

# Impute one column at a time: the 'NA' placeholder for text, 0.0 for numbers
for col in categorical_cols:
    df[col] = df[col].fillna('NA')
for col in numerical_cols:
    df[col] = df[col].fillna(0.0)

# Confirm that no nulls remain in any column
print(df.isna().sum())

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


In [7]:
# Pairwise Pearson correlation between the numeric feature columns.
numerical_cols = [
    'number_of_courses_viewed',
    'annual_income',
    'interaction_count',
    'lead_score',
]
corr_matrix = df.loc[:, numerical_cols].corr(method='pearson')
print(corr_matrix)

                          number_of_courses_viewed  annual_income  \
number_of_courses_viewed                  1.000000       0.009770   
annual_income                             0.009770       1.000000   
interaction_count                        -0.023565       0.027036   
lead_score                               -0.004879       0.015610   

                          interaction_count  lead_score  
number_of_courses_viewed          -0.023565   -0.004879  
annual_income                      0.027036    0.015610  
interaction_count                  1.000000    0.009888  
lead_score                         0.009888    1.000000  


In [8]:
from sklearn.model_selection import train_test_split

# The 'converted' column is the target; every other column is a feature
y = df['converted']
X = df.drop(columns=['converted'])

# Stage 1: hold out 40% of the rows, leaving 60% for training.
# Stratifying on the target keeps the class balance the same in both parts.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    stratify=y,
    random_state=42,
)

# Stage 2: cut the held-out 40% in half -> 20% validation and 20% test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

# Report the shape of each partition
print(f"Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")

Train: (877, 8), Validation: (292, 8), Test: (293, 8)


In [9]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

# Categorical features whose mutual information with the target is measured
categorical_cols = ['industry', 'location', 'lead_source', 'employment_status']

# Integer-encode each categorical column on a copy so X_train itself stays
# untouched; mutual_info_classif needs numeric input.
X_train_encoded = X_train.copy()
for col in categorical_cols:
    le = LabelEncoder()
    X_train_encoded[col] = le.fit_transform(X_train_encoded[col])

# Mutual information between each encoded column and the training target.
# discrete_features=True tells the estimator to treat the integer codes as
# category labels rather than continuous values.
mi_scores = mutual_info_classif(X_train_encoded[categorical_cols], y_train, discrete_features=True)

# Round to 2 decimals and key the scores by column name. The scores come back
# as NumPy scalars; casting to built-in float keeps the printed dict readable
# (plain 0.01 instead of np.float64(0.01)) under NumPy 2's scalar repr.
mi_scores_rounded = {col: round(float(score), 2) for col, score in zip(categorical_cols, mi_scores)}
print(mi_scores_rounded)

{'industry': 0.01, 'location': 0.0, 'lead_source': 0.03, 'employment_status': 0.01}


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Column groups: the four text columns are encoded; every remaining column
# of X_train is treated as numeric.
categorical_cols = ['industry', 'location', 'lead_source', 'employment_status']
numerical_cols = [name for name in X_train.columns if name not in categorical_cols]

# One-hot encode the categorical group (dropping each column's first level)
# and pass the numeric group through unchanged.
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(drop='first'), categorical_cols),
    ('num', 'passthrough', numerical_cols),
])

# Baseline classifier
model = LogisticRegression(
    C=1.0,
    solver='liblinear',
    random_state=42,
    max_iter=1000,
)

# Chain preprocessing and classifier, then train on the training partition
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model),
])
pipeline.fit(X_train, y_train)

# Score the fitted pipeline on the validation partition
y_val_pred = pipeline.predict(X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
print(round(accuracy, 2))

0.68


In [11]:
# Leave-one-feature-out check: retrain the same model without each candidate
# feature and record how much validation accuracy changes relative to the
# baseline value already stored in `accuracy`.
features_to_test = ['industry', 'employment_status', 'lead_score']
diff_dict = {}

for feat in features_to_test:
    # Remove the candidate feature from both partitions
    X_train_temp = X_train.drop(columns=[feat])
    X_val_temp = X_val.drop(columns=[feat])

    # Recompute the column groups for the reduced frame
    cat_cols_temp = [name for name in categorical_cols if name in X_train_temp.columns]
    num_cols_temp = [name for name in X_train_temp.columns if name not in cat_cols_temp]

    # Same preprocessing as the baseline, restricted to the remaining columns
    preprocessor_temp = ColumnTransformer([
        ('cat', OneHotEncoder(drop='first'), cat_cols_temp),
        ('num', 'passthrough', num_cols_temp),
    ])

    # Fresh classifier with the baseline hyper-parameters
    pipeline_temp = Pipeline(steps=[
        ('preprocessor', preprocessor_temp),
        ('classifier', LogisticRegression(C=1.0, solver='liblinear', random_state=42, max_iter=1000)),
    ])
    pipeline_temp.fit(X_train_temp, y_train)

    # Positive difference means accuracy dropped when the feature was removed
    y_val_pred_temp = pipeline_temp.predict(X_val_temp)
    acc_temp = accuracy_score(y_true=y_val, y_pred=y_val_pred_temp)
    diff_dict[feat] = accuracy - acc_temp

print("Accuracy differences after removing each feature:", diff_dict)

Accuracy differences after removing each feature: {'industry': 0.0, 'employment_status': 0.003424657534246589, 'lead_score': 0.006849315068493067}


In [12]:
# Sweep the regularisation parameter C and record the validation accuracy
# (rounded to 3 decimals) obtained for each value.
C_values = [0.01, 0.1, 1, 10, 100]
accuracy_dict = {}

for C_val in C_values:
    # Same solver and seed as the baseline; only C varies
    model = LogisticRegression(
        C=C_val,
        solver='liblinear',
        random_state=42,
        max_iter=1000,
    )
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model),
    ])
    pipeline.fit(X_train, y_train)

    y_val_pred = pipeline.predict(X_val)
    accuracy_dict[C_val] = round(accuracy_score(y_true=y_val, y_pred=y_val_pred), 3)

print("Validation accuracies for different C values:", accuracy_dict)


Validation accuracies for different C values: {0.01: 0.688, 0.1: 0.682, 1: 0.685, 10: 0.685, 100: 0.685}
