# Code for the optimised DecisionTreeClassifier (which is currently being used in project)

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.calibration import CalibratedClassifierCV
import joblib

# Load the dataset
df = pd.read_csv("C:/Users/shivansh/Downloads/Drop off rate dataset - Sheet1.csv")

# Inspect the dataset (Check the column names and first few rows)
print("Columns in the dataset:", df.columns)
print("First few rows of the dataset:\n", df.head())

# Prepare the dropOff column based on the presence of response to the next question
df = df.sort_values(by=['ppSurveyFormId', 'ppSurveyFormQuestionId'])

# Shift the 'response' column to check if the next row has a response
df['next_response'] = df['response'].shift(-1)

# Create the dropOff target based on whether the next response exists
df['dropOff'] = df['next_response'].apply(lambda x: 1 if pd.isnull(x) else 0)

# Drop the temporary 'next_response' column
df.drop(columns=['next_response'], inplace=True)

# Create features (response length, time, etc.) and target
df['response'] = df['response'].fillna('')
df['response_length'] = df['response'].apply(len)

# Convert 'time' column from HH:MM:SS to datetime, then extract hour
df['time'] = pd.to_datetime(df['time'], format='%H:%M:%S').dt.hour

# Define the feature columns and target
features = ['response_length', 'time', 'ppSurveyFormId', 'ppSurveyFormQuestionId', 'isLastQuestion']
target = 'dropOff'

# Split the dataset into training and testing sets
X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model (using DecisionTreeClassifier with class weight and calibration)
model = CalibratedClassifierCV(DecisionTreeClassifier(random_state=42, class_weight='balanced'), method='sigmoid')
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Print classification report and confusion matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Save the trained model as a .pkl file
# joblib.dump(model, 'dropoff_model.pkl')

Columns in the dataset: Index(['response', 'time', 'ppSurveyFormId', 'ppSurveyFormQuestionId',
       'isLastQuestion', 'dropOff'],
      dtype='object')
First few rows of the dataset:
     response      time  ppSurveyFormId  ppSurveyFormQuestionId  \
0        Yes  10:00:00             101                       1   
1         No  10:01:00             101                       2   
2        NaN  10:02:00             101                       3   
3       Good  15:30:00             102                       1   
4  Very Good  15:32:00             102                       2   

   isLastQuestion  dropOff  
0               0        0  
1               0        0  
2               0        1  
3               0        0  
4               1        0  
Model Accuracy: 0.88
              precision    recall  f1-score   support

           0       0.88      1.00      0.93         7
           1       0.00      0.00      0.00         1

    accuracy                           0.88         8
   m

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Code for simply using Decision Tree Classifier

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import joblib

# Load the dataset
df = pd.read_csv("C:/Users/shivansh/Downloads/Drop off rate dataset - Sheet1.csv")

# Inspect the dataset (Check the column names and first few rows)
print("Columns in the dataset:", df.columns)
print("First few rows of the dataset:\n", df.head())

# Prepare the dropOff column based on the presence of response to the next question
df = df.sort_values(by=['ppSurveyFormId', 'ppSurveyFormQuestionId'])

# Shift the 'response' column to check if the next row has a response
df['next_response'] = df['response'].shift(-1)

# Create the dropOff target based on whether the next response exists
df['dropOff'] = df['next_response'].apply(lambda x: 1 if pd.isnull(x) else 0)

# Drop the temporary 'next_response' column
df.drop(columns=['next_response'], inplace=True)

# Create features (response length, time, etc.) and target
df['response'] = df['response'].fillna('')
df['response_length'] = df['response'].apply(len)

# Convert 'time' column from HH:MM:SS to datetime, then extract hour
df['time'] = pd.to_datetime(df['time'], format='%H:%M:%S').dt.hour

# Define the feature columns and target
features = ['response_length', 'time', 'ppSurveyFormId', 'ppSurveyFormQuestionId', 'isLastQuestion']
target = 'dropOff'

# Split the dataset into training and testing sets
X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model (using DecisionTreeClassifier)
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Save the trained model as a .pkl file
# joblib.dump(model, 'dropoff_model.pkl')

Columns in the dataset: Index(['response', 'time', 'ppSurveyFormId', 'ppSurveyFormQuestionId',
       'isLastQuestion', 'dropOff'],
      dtype='object')
First few rows of the dataset:
     response      time  ppSurveyFormId  ppSurveyFormQuestionId  \
0        Yes  10:00:00             101                       1   
1         No  10:01:00             101                       2   
2        NaN  10:02:00             101                       3   
3       Good  15:30:00             102                       1   
4  Very Good  15:32:00             102                       2   

   isLastQuestion  dropOff  
0               0        0  
1               0        0  
2               0        1  
3               0        0  
4               1        0  
Model Accuracy: 0.88
