In [2]:
import pandas as pd

In [3]:
# Read the raw dataset from data.csv (path is relative to the working directory).
df = pd.read_csv("data.csv")

In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) per column.
# A `count` lower than the other columns' counts indicates missing values.
df.describe()

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score,converted
count,1462.0,1281.0,1462.0,1462.0,1462.0
mean,2.031464,59886.273224,2.976744,0.506108,0.619015
std,1.449717,15070.140389,1.681564,0.288465,0.485795
min,0.0,13929.0,0.0,0.0,0.0
25%,1.0,49698.0,2.0,0.2625,0.0
50%,2.0,60148.0,3.0,0.51,1.0
75%,3.0,69639.0,4.0,0.75,1.0
max,9.0,109899.0,11.0,1.0,1.0


In [5]:
# List the column names available in the dataset.
df.columns

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score',
       'converted'],
      dtype='object')

In [6]:
# Most frequent value(s) of the `industry` column; the result is a Series
# because several values can share the top frequency.
df['industry'].mode()

0    retail
Name: industry, dtype: object

In [7]:
# Data type of every column: `object` columns hold strings (categorical),
# int64/float64 columns are numeric.
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [8]:
numerical_features = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

# Pairwise correlation matrix of the numerical columns.
# DataFrame.corr() excludes missing values pair by pair, so gaps in a column
# do not turn the coefficients into NaN.
correlation_matrix = df[numerical_features].corr()

# Feature pairs whose correlation should be compared
pairs = [
    ('interaction_count', 'lead_score'),
    ('number_of_courses_viewed', 'lead_score'),
    ('number_of_courses_viewed', 'interaction_count'),
    ('annual_income', 'interaction_count')
]

# Signed correlation coefficient for each pair of interest
correlations = {pair: correlation_matrix.loc[pair[0], pair[1]] for pair in pairs}

# "Biggest" correlation means the strongest relationship in either direction,
# so rank by absolute value; a plain max() would never pick a strong negative
# correlation. The signed coefficient is still what gets reported.
max_correlation_pair = max(correlations, key=lambda pair: abs(correlations[pair]))
max_correlation_value = correlations[max_correlation_pair]

# Display the results
print(f"The two features with the biggest correlation are: {max_correlation_pair}")
print(f"The correlation value is: {max_correlation_value}")

The two features with the biggest correlation are: ('annual_income', 'interaction_count')
The correlation value is: 0.048618416552580965


In [12]:
import numpy as np
from sklearn.model_selection import train_test_split

# `df` is expected to be loaded by an earlier cell.

# Target vector first, then the feature matrix (every column except the target).
y = df['converted']
X = df.drop(columns=['converted'])

# Stage 1: hold out 40% of the rows, stratified on the target.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
    stratify=y,
)

# Stage 2: cut the hold-out in half -> validation and test, about 20% of the data each.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

# Report how many rows ended up in each partition.
split_sizes = (
    ("Train size:", X_train),
    ("Validation size:", X_val),
    ("Test size:", X_test),
)
for caption, frame in split_sizes:
    print(caption, len(frame))

Train size: 877
Validation size: 292
Test size: 293


In [14]:
from sklearn.feature_selection import mutual_info_classif
# Categorical columns whose dependence on the target is measured
categorical_features = ['lead_source', 'industry', 'employment_status', 'location']

# Encode every category as an integer code. A missing value gets the code -1
# (pandas convention), i.e. it is treated as one more category.
X_train_cat = X_train[categorical_features].apply(lambda x: x.astype('category').cat.codes)

# The codes are arbitrary labels, not magnitudes, so they must be declared
# discrete. With the default discrete_features='auto', a dense input is
# treated as continuous and scored with a nearest-neighbour estimator, which
# is not meaningful for category codes.
mi_scores = mutual_info_classif(
    X_train_cat, y_train, discrete_features=True, random_state=42
)

# Keep the unrounded scores for ranking; round only for display so that
# features whose scores round to the same value are still ordered correctly.
mi_raw = dict(zip(categorical_features, mi_scores))
mi_results = {feature: round(score, 2) for feature, score in mi_raw.items()}

# Display all MI scores
print("Mutual Information Scores:")
for feature, score in mi_results.items():
    print(f"{feature}: {score}")

# Find the variable with the highest (unrounded) MI score
max_mi_feature = max(mi_raw, key=mi_raw.get)
print(f"\nFeature with highest mutual information: {max_mi_feature}")

Mutual Information Scores:
lead_source: 0.03
industry: 0.0
employment_status: 0.0
location: 0.0

Feature with highest mutual information: lead_source


In [16]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score

# Column groups: string-valued columns get one-hot encoded, numeric columns
# are only imputed.
categorical_features = ['lead_source', 'industry', 'employment_status', 'location']
numerical_features = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fitting are ignored at prediction time.
categorical_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Numerical branch: fill gaps with the column mean.
numerical_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_steps, categorical_features),
        ('num', numerical_steps, numerical_features),
    ]
)

# Logistic regression with the specified parameters.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

# Preprocessing followed by the classifier, as one estimator.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model),
])

# Train on the training split, then score the validation split.
y_val_pred = pipeline.fit(X_train, y_train).predict(X_val)

# Validation accuracy, rounded to two decimals for reporting.
val_accuracy = round(accuracy_score(y_val, y_val_pred), 2)

print(f"Validation Accuracy: {val_accuracy}")



Validation Accuracy: 0.7


In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
import numpy as np

# List of features
all_features = ['lead_source', 'industry', 'employment_status', 'location', 
                'number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

# Categorical and numerical features
categorical_features = ['lead_source', 'industry', 'employment_status', 'location']
numerical_features = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']


def make_preprocessor(cat_cols, num_cols):
    """Return a new, unfitted ColumnTransformer for the given column lists.

    Parameters
    ----------
    cat_cols : list of str
        Columns imputed with the most frequent value and then one-hot encoded.
    num_cols : list of str
        Columns imputed with the column mean.
    """
    return ColumnTransformer(
        transformers=[
            ('cat', Pipeline([
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(handle_unknown='ignore'))
            ]), cat_cols),
            ('num', Pipeline([
                ('imputer', SimpleImputer(strategy='mean'))
            ]), num_cols)
        ]
    )


def make_classifier():
    """Return a new, unfitted logistic regression with the baseline settings."""
    return LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)


# Baseline: every feature. `preprocessor`, `model` and `pipeline` are only
# assigned here, so after this cell they still describe the full-feature
# setup and can be safely reused by later cells.
preprocessor = make_preprocessor(categorical_features, numerical_features)
model = make_classifier()
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)
])

# Fit the model with all features
pipeline.fit(X_train, y_train)

# Get the baseline accuracy
y_val_pred = pipeline.predict(X_val)
baseline_accuracy = accuracy_score(y_val, y_val_pred)

# Accuracy difference (baseline - accuracy without the feature) per feature.
# A negative value means validation accuracy went UP when the feature was removed.
accuracy_differences = {}

for feature in all_features:
    # Exclude one feature from the dataset
    features_excluding_one = [f for f in all_features if f != feature]

    # Fresh preprocessor and classifier under loop-local names: nothing is
    # shared with the baseline objects and no global is overwritten.
    reduced_pipeline = Pipeline(steps=[
        ('preprocessor', make_preprocessor(
            [f for f in categorical_features if f != feature],
            [f for f in numerical_features if f != feature],
        )),
        ('classifier', make_classifier())
    ])

    # Fit the reduced model and score it on the validation split
    reduced_pipeline.fit(X_train[features_excluding_one], y_train)
    reduced_accuracy = accuracy_score(
        y_val, reduced_pipeline.predict(X_val[features_excluding_one])
    )

    accuracy_differences[feature] = baseline_accuracy - reduced_accuracy

# The signed minimum selects the feature whose removal costs the least
# accuracy (or improves it the most).
least_useful_feature = min(accuracy_differences, key=accuracy_differences.get)
print(f"The least useful feature is: {least_useful_feature}")


The least useful feature is: annual_income


In [19]:
# One line per feature: the drop in validation accuracy when it is left out.
for name in accuracy_differences:
    delta = accuracy_differences[name]
    print(f"Feature: {name}, Accuracy Difference: {delta}")

Feature: lead_source, Accuracy Difference: 0.013698630136986356
Feature: industry, Accuracy Difference: 0.003424657534246589
Feature: employment_status, Accuracy Difference: 0.003424657534246589
Feature: location, Accuracy Difference: 0.0
Feature: number_of_courses_viewed, Accuracy Difference: 0.07534246575342463
Feature: annual_income, Accuracy Difference: -0.13356164383561642
Feature: interaction_count, Accuracy Difference: 0.07534246575342463
Feature: lead_score, Accuracy Difference: 0.0


In [20]:
# Candidate values of C (inverse regularization strength) to try
C_values = [0.01, 0.1, 1, 10, 100]

# Build the preprocessing here instead of reusing a `preprocessor` variable
# left over from an earlier cell, so the search always sees every feature
# regardless of what earlier cells did to that name.
tuning_preprocessor = ColumnTransformer(
    transformers=[
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features),
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean'))
        ]), numerical_features)
    ]
)

# Validation accuracy obtained for each candidate C
accuracy_by_C = {}

# Train logistic regression for each C value
for C in C_values:
    # Create the logistic regression model with the current C value
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)

    # The pipeline refits the preprocessing steps on every fit() call
    pipeline = Pipeline(steps=[
        ('preprocessor', tuning_preprocessor),
        ('classifier', model)
    ])

    # Fit the model
    pipeline.fit(X_train, y_train)

    # Predict on validation set
    y_val_pred = pipeline.predict(X_val)

    # Calculate accuracy
    accuracy = accuracy_score(y_val, y_val_pred)
    accuracy_by_C[C] = accuracy

# Pick the best accuracy, then the smallest C among those that reach it.
# Selecting from the recorded scores makes the tie-break independent of the
# order of C_values and avoids comparing a candidate against an unset best_C.
best_accuracy = max(accuracy_by_C.values(), default=0)
best_C = min(
    (c for c, acc in accuracy_by_C.items() if acc == best_accuracy),
    default=None,
)

# Display the best C value
print(f"The best value of C is: {best_C}")


The best value of C is: 0.01
