In [1]:
!pip install scikit-learn --upgrade
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

Collecting scikit-learn
  Downloading scikit_learn-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
Successfully installed scikit-learn-1.3.0


## 1. Data reading

In [2]:
# Source: Google Sheets "edit" link of the training data
link = "https://docs.google.com/spreadsheets/d/1_J4zHZYnSp_NcnpATIFnpOp9i4BOslGLLOdKsIb71vk/edit#gid=645073837"

# The spreadsheet ID sits between "/d/" and the next "/".
sheet_id = link.split("/d/")[1].split("/")[0]
# Everything after "gid=" identifies the tab (kept under the name `sheet_name`).
sheet_name = link.partition("gid=")[2]

# CSV export endpoint for that spreadsheet / tab
url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv&gid={sheet_name}"

# Load the exported CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=url)

# Show the DataFrame as the cell output
df

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
0,OH,107,area_code_415,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.70,1,no
1,NJ,137,area_code_415,no,no,0,243.4,114,41.38,121.2,110,10.30,162.6,104,7.32,12.2,5,3.29,0,no
2,OH,84,area_code_408,yes,no,0,299.4,71,50.90,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,no
3,OK,75,area_code_415,yes,no,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,no
4,MA,121,area_code_510,no,yes,24,218.2,88,37.09,348.5,108,29.62,212.6,118,9.57,7.5,7,2.03,3,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4245,MT,83,area_code_415,no,no,0,188.3,70,32.01,243.8,88,20.72,213.7,79,9.62,10.3,6,2.78,0,no
4246,WV,73,area_code_408,no,no,0,177.9,89,30.24,131.2,82,11.15,186.2,89,8.38,11.5,6,3.11,3,no
4247,NC,75,area_code_408,no,no,0,170.7,101,29.02,193.1,126,16.41,129.1,104,5.81,6.9,7,1.86,1,no
4248,HI,50,area_code_408,no,yes,40,235.7,127,40.07,223.0,126,18.96,297.5,116,13.39,9.9,5,2.67,2,no


In [3]:
# X and y creation:

# 'churn' is the target variable; every other column is a feature.
# Select/drop instead of `df.pop(...)`: `df` is left untouched, so this cell can be
# re-run without a KeyError on an already-removed column.
y = df['churn']
X = df.drop(columns='churn')

To make the evaluation of our classification model more reliable, we use stratified sampling, so that the class distribution of `churn` is preserved in both the training and the test set.

In [4]:
# data splitting:

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on y and uses a
# fixed seed so that it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=88,
    stratify=y,
)


## 3. Categorical encoding - "Automated" approach (Using Pipelines)

### 3.1. Creating the "numeric pipe" and the "categoric pipe"

In [5]:
# Column labels of X split by dtype: numeric columns vs. everything else
X_num_columns = X.select_dtypes(include="number").columns
X_cat_columns = X.select_dtypes(exclude="number").columns

In [9]:
# Numerical pipeline | Without Scaling (Current Approach):
# the only step fills missing values with the column mean.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
    ]
)


# Categorical pipeline:
#   1. fill missing values with the constant 'N_A'
#   2. map each category to an integer code (unseen categories become -1)
#   3. one-hot encode those codes into a dense array
# NOTE(review): step 2 looks redundant before a one-hot encoder; the step names are
# used as keys by the parameter grid later in the notebook, so they are left as-is.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='N_A')),
        ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
        ('onehot_encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]
)


### 3.2. Using `ColumnTransformer` to build a pipeline with 2 branches (the `preprocessor`)

We simply tell the pipeline the following:

- One branch, called `"num_pipeline"`, will apply the steps in `num_pipeline` to the columns named in `X_num_columns`
- The second branch, called `"cat_pipeline"`, will apply the steps in `cat_pipeline` to the columns named in `X_cat_columns`

In [10]:
from sklearn.compose import ColumnTransformer

# Two-branch preprocessor: each entry is (branch name, pipeline, columns it receives).
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, X_num_columns),
        ('cat_pipeline', cat_pipeline, X_cat_columns),
    ]
)

### 3.3. Creating the `full_pipeline` (`preprocessor` +  Random Forest Classifier)

Pipelines are modular. The `preprocessor` we created above with the `ColumnTransformer` can now become a step in a new pipeline, which we'll call `full_pipeline` and which will include, as its last step, a Random Forest Classifier model:

In [11]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier

# Full pipeline: preprocessing followed by a Random Forest classifier.
# The forest is seeded (same seed as the train/test split) so that repeated runs of
# the notebook fit the same model and report the same scores.
# `set_output(transform='pandas')` makes the transform steps hand DataFrames onward.
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('randomforestclassifier', RandomForestClassifier(random_state=88))
]).set_output(transform='pandas')


# Fit the pipeline on the training data
full_pipeline.fit(X_train, y_train)

In [None]:
# Predict on the same rows the pipeline was fitted on (training data)
pipeline_pred = full_pipeline.predict(X=X_train)
pipeline_pred

array(['no', 'no', 'no', ..., 'no', 'yes', 'yes'], dtype=object)

In [None]:
# Share of training rows whose predicted label matches the true label
accuracy = accuracy_score(y_train, pipeline_pred)
accuracy

1.0

In [None]:
# Predict on the held-out test rows
pipeline_test = full_pipeline.predict(X=X_test)

In [None]:
# Share of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_test, pipeline_test)
accuracy

0.9470588235294117

## 4. Train our model with GridSearch cross validation

### 4.1 Defining ParameterGrid

In [None]:
# Hyperparameter grid for the grid search. Each key addresses a parameter through the
# nested step names of `full_pipeline`: <step>__<branch>__<sub-step>__<parameter>.
paramGrid = {
    'preprocessor__num_pipeline__imputer__strategy': ['mean', 'median'],
    'preprocessor__cat_pipeline__imputer__strategy': ['most_frequent', 'constant'],
    'preprocessor__cat_pipeline__ordinal_encoder__handle_unknown': ['use_encoded_value'],
    'preprocessor__cat_pipeline__ordinal_encoder__unknown_value': [-1],
    'preprocessor__cat_pipeline__onehot_encoder__handle_unknown': ['ignore'],
    # `sparse` is the deprecated name of this option; the encoder is built with
    # `sparse_output`. Only dense output is searched because the pipeline is set to
    # return pandas DataFrames, which cannot hold the encoder's sparse matrices.
    'preprocessor__cat_pipeline__onehot_encoder__sparse_output': [False],
    'randomforestclassifier__n_estimators': [100, 200, 300],
    'randomforestclassifier__max_depth': [None, 5, 10],
    'randomforestclassifier__min_samples_split': [2, 5, 10],
    'randomforestclassifier__min_samples_leaf': [1, 2, 4],
    # 'auto' is no longer an accepted value in scikit-learn 1.3 (for classifiers it
    # meant the same as 'sqrt'), so those candidates could not be fitted.
    'randomforestclassifier__max_features': ['sqrt', 'log2']
}

### 4.2 Define cross validation: GridSearchCV

In [None]:
# Exhaustive search over `paramGrid`, scoring every candidate with 5-fold cross validation

from sklearn.model_selection import GridSearchCV

Search = GridSearchCV(
    estimator=full_pipeline,
    param_grid=paramGrid,
    cv=5,
    verbose=1,
)

In [None]:
# Run the grid search on the training data (features and target)
Search.fit(X_train, y=y_train)

### 4.3  Predict on the Training set

In [None]:
# Training-set predictions from the fitted grid search
train_pred = Search.predict(X=X_train)

# Accuracy of those predictions against the true training labels
train_accuracy = accuracy_score(y_train, train_pred)
train_accuracy

1.0

### 4.4  Predict on the Test set

In [None]:
# Test-set predictions from the fitted grid search
test_pred = Search.predict(X=X_test)

# Accuracy of those predictions against the true test labels
test_accuracy = accuracy_score(y_test, test_pred)

# Report the test accuracy
print("Accuracy for the testing data set is:", test_accuracy)

Accuracy for the testing data set is: 0.9529411764705882


## 5. Competitions database

In [None]:
# Source: Google Sheets "edit" link of the competition data
link = "https://docs.google.com/spreadsheets/d/1kP0wYEZuRlXpgoR8dBpXBZpFwoMdsTLm6z4c7PyE2n4/edit#gid=1354694019"

# The spreadsheet ID sits between "/d/" and the next "/".
sheet_id = link.split("/d/")[1].split("/")[0]
# Everything after "gid=" identifies the tab (kept under the name `sheet_name`).
sheet_name = link.partition("gid=")[2]

# CSV export endpoint for that spreadsheet / tab
url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv&gid={sheet_name}"

# Load the exported CSV into a DataFrame
new_data = pd.read_csv(filepath_or_buffer=url)

# Show the DataFrame as the cell output
new_data

Unnamed: 0,id,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls
0,1,KS,128,area_code_415,no,yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.70,1
1,2,AL,118,area_code_510,yes,no,0,223.4,98,37.98,220.6,101,18.75,203.9,118,9.18,6.3,6,1.70,0
2,3,IA,62,area_code_415,no,no,0,120.7,70,20.52,307.2,76,26.11,203.0,99,9.14,13.1,6,3.54,4
3,4,VT,93,area_code_510,no,no,0,190.7,114,32.42,218.2,111,18.55,129.6,121,5.83,8.1,3,2.19,3
4,5,NE,174,area_code_415,no,no,0,124.3,76,21.13,277.1,112,23.55,250.7,115,11.28,15.5,5,4.19,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,746,GA,130,area_code_415,no,no,0,119.4,99,20.30,226.3,97,19.24,202.7,111,9.12,11.3,7,3.05,0
746,747,WA,73,area_code_408,no,no,0,177.2,118,30.12,270.5,84,22.99,241.8,112,10.88,12.3,2,3.32,3
747,748,WV,152,area_code_415,no,no,0,184.2,90,31.31,256.8,73,21.83,213.6,113,9.61,14.7,2,3.97,3
748,749,DC,61,area_code_415,no,no,0,140.6,89,23.90,172.8,128,14.69,212.4,97,9.56,13.6,4,3.67,1


In [None]:
# Predict churn for the competition rows with the fitted grid search
comp_pred = Search.predict(X=new_data)


In [None]:
# Display the predicted labels for the competition data as the cell output
comp_pred

array(['no', 'no', 'yes', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'yes',
       'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no',
       'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no',
       'no', 'no', 'no', 'no', 'yes', 'no', 'yes', 'no', 'no', 'no', 'no',
       'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'yes',
       'no', 'no', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no', 'no',
       'no', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no', 'no', 'no',
       'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no',
       'no', 'no', 'no', 'yes', 'no', 'no', 'no', 'no', 'no', 'no', 'no',
       'yes', 'no', 'no', 'no', 'no', 'yes', 'no', 'no', 'no', 'yes',
       'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'yes', 'no',
       'no', 'yes', 'no', 'no', 'no', 'no', 'no', 'no', 'yes', 'no', 'no',
       'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'yes', 'no',
       'no', 'no', 'no', 'no', 'no', 'no

In [None]:
# Build the submission table: one row per competition record with its `id` and the
# predicted `churn` label (the prediction column is created as column 0 and renamed).
comp_pred = (
    pd.DataFrame(comp_pred)
    .assign(id=new_data['id'])
    .rename(columns={0: 'churn'})
    [['id', 'churn']]
)
comp_pred

Unnamed: 0,id,churn
0,1,no
1,2,no
2,3,yes
3,4,no
4,5,no
...,...,...
745,746,no
746,747,no
747,748,no
748,749,no


In [None]:
# NOTE(review): this cell repeats the previous one. On the already formatted frame the
# rename finds no column 0 and the result is the same two-column table.
comp_pred = pd.DataFrame(comp_pred)
comp_pred = comp_pred.assign(id=new_data['id'])
comp_pred = comp_pred.rename(columns={0: 'churn'})
comp_pred = comp_pred.loc[:, ['id', 'churn']]
comp_pred

Unnamed: 0,id,churn
0,1,no
1,2,no
2,3,yes
3,4,no
4,5,no
...,...,...
745,746,no
746,747,no
747,748,no
748,749,no


In [None]:
# Write the submission table to "comp_pred.csv" without the index column
comp_pred.to_csv(path_or_buf="comp_pred.csv", index=False)

In [None]:
# Download the CSV file
# NOTE(review): `google.colab` is presumably only importable inside Google Colab, so
# this cell would fail in other environments — confirm before running elsewhere.
from google.colab import files
# Hand the file written by the export cell to the Colab download helper.
files.download("comp_pred.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>