# Import the created functions

In [1]:
import numpy as np
import pandas as pd

from src.prepare_data import prepare
from src.preprocess_data import preprocess
from src.train_models import model_1, model_2

# Retrieve the data

In [2]:
# Retrieve data
df = prepare('office_churn_dataset.csv')

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1543 entries, 0 to 1542
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Branch                   1535 non-null   object 
 1   Tenure                   1534 non-null   float64
 2   Salary                   1534 non-null   float64
 3   Department               1543 non-null   object 
 4   JobSatisfaction          1515 non-null   float64
 5   WorkLifeBalance          1515 non-null   float64
 6   CommuteDistance          1543 non-null   object 
 7   MaritalStatus            1543 non-null   object 
 8   Education                1543 non-null   object 
 9   PerformanceRating        1536 non-null   float64
 10  TrainingHours            1352 non-null   float64
 11  OverTime                 1443 non-null   object 
 12  NumProjects              1444 non-null   float64
 13  YearsSincePromotion      1542 non-null   float64
 14  EnvironmentSatisfaction 

Unnamed: 0,Branch,Tenure,Salary,Department,JobSatisfaction,WorkLifeBalance,CommuteDistance,MaritalStatus,Education,PerformanceRating,TrainingHours,OverTime,NumProjects,YearsSincePromotion,EnvironmentSatisfaction,ChurnLikelihood
0,San Francisco,4.0,63000.0,Legal,3.0,3.0,Long,Married,High School,3.0,88.0,True,3.0,0.0,2.0,Highly Likely to Churn
1,Chicago,14.0,72000.0,Accounting,4.0,4.0,Short,Single,Bachelor,3.666667,30.0,True,3.0,2.0,3.0,Moderately Likely to Churn
2,Miami,4.0,40000.0,Quality Assurance,3.0,3.0,Medium,Single,High School,3.666667,64.0,,,0.0,4.0,Highly Likely to Churn
3,Scranton,2.0,55000.0,Legal,3.0,3.5,Short,Married,Bachelor,3.666667,30.0,True,4.0,0.0,3.0,Moderately Likely to Churn
4,Scranton,10.0,55500.0,Legal,3.0,3.0,Medium,Married,Bachelor,3.333333,18.0,,4.0,1.0,3.0,Moderately Likely to Churn


# Preprocess the data

In [3]:
# Preprocess the data
preprocessed_df = preprocess(df)

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
preprocessed_df.info()
preprocessed_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 904 entries, 1 to 1541
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Branch                   904 non-null    object 
 1   Tenure                   904 non-null    float64
 2   Salary                   904 non-null    float64
 3   Department               904 non-null    object 
 4   JobSatisfaction          904 non-null    float64
 5   WorkLifeBalance          904 non-null    float64
 6   CommuteDistance          904 non-null    object 
 7   MaritalStatus            904 non-null    object 
 8   Education                904 non-null    object 
 9   PerformanceRating        904 non-null    float64
 10  TrainingHours            904 non-null    float64
 11  NumProjects              904 non-null    float64
 12  YearsSincePromotion      904 non-null    float64
 13  EnvironmentSatisfaction  904 non-null    float64
 14  ChurnLikelihood          904 n

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ChurnLikelihood'] = df['ChurnLikelihood'].map({'Slightly Likely to Churn' : 0, 'Moderately Likely to Churn' : 1, 'Highly Likely to Churn' : 2}).astype('int')


Unnamed: 0,Branch,Tenure,Salary,Department,JobSatisfaction,WorkLifeBalance,CommuteDistance,MaritalStatus,Education,PerformanceRating,TrainingHours,NumProjects,YearsSincePromotion,EnvironmentSatisfaction,ChurnLikelihood
1,Chicago,14.0,72000.0,Accounting,4.0,4.0,Short,Single,Bachelor,3.666667,30.0,3.0,2.0,3.0,1
3,Scranton,2.0,55000.0,Legal,3.0,3.5,Short,Married,Bachelor,3.666667,30.0,4.0,0.0,3.0,1
4,Scranton,10.0,55500.0,Legal,3.0,3.0,Medium,Married,Bachelor,3.333333,18.0,4.0,1.0,3.0,1
6,Boston,10.0,82000.0,Sales,5.0,3.0,Medium,Married,Bachelor,3.666667,34.645646,3.0,1.0,3.0,1
7,New York,6.0,59000.0,Administration,3.0,3.5,Short,Divorced,High School,3.333333,40.0,3.0,0.0,3.0,1


# Model 1

## Create and fit Model 1

In [4]:
# Import the training function under an alias: assigning its result to
# `model_1` would otherwise overwrite the imported function, and re-running
# this cell would then call the previously returned object instead of training.
from src.train_models import model_1 as train_model_1

model_1 = train_model_1(preprocessed_df)

Logistic Regression: 98.895
Training MAE: 0.01
Test data MAE: 0.01


## Feature Importance

In [7]:
# Look up the pipeline steps once, then read the encoder's feature names and
# the first row of the logistic-regression coefficient matrix.
pipeline_steps = model_1.named_steps
features = pipeline_steps["onehotencoder"].get_feature_names()
# NOTE(review): coef_[0] is a single coefficient row, while ChurnLikelihood is
# mapped to three classes (0/1/2) during preprocessing - presumably this row
# describes only one of those classes; confirm before interpreting it as an
# overall feature importance.
coefs = pipeline_steps["logisticregression"].coef_[0]



In [8]:
# Exponentiate the coefficients to obtain odds ratios, label them with the
# feature names and sort ascending so the smallest ratios come first.
# (numpy is already imported as `np` in the notebook's first cell, so it is
# not re-imported here.)
odds_ratios = pd.Series(np.exp(coefs), index=features).sort_values()
odds_ratios.head()

Branch_Miami             0.519009
TrainingHours            0.596666
Branch_Atlanta           0.610408
Branch_Seattle           0.663997
Education_High School    0.671927
dtype: float64

### Plot Feature Importance

In [10]:
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

In [11]:
# Plot the first ten entries of the ascending-sorted odds ratios as a bar chart.
# The former `plt.Figure(figsize=(15,10))` call only built an unused matplotlib
# object and had no effect on this plotly figure, so it has been dropped.
lowest_odds = odds_ratios.iloc[:10]

fig = px.bar(
    x=lowest_odds.values,
    y=lowest_odds.index,
    title="Customer Churn Logistic Regression, Feature Importance (Odds Ratio)"
)

# Set the axis titles explicitly.
fig.update_layout(xaxis_title='Odds Ratio', yaxis_title='')
fig.show()

## Predict

In [16]:
def make_prediction(row):
    """Predict with ``model_1`` for a single positional record.

    Parameters
    ----------
    row : indexable
        Values are read with ``row[1]`` through ``row[14]``, in the order of
        the feature names listed below. ``row[0]`` is never read (the sample
        calls in this notebook pass a running number there).

    Returns
    -------
    str
        ``"Prediction: <label>"`` where ``<label>`` is the first element of
        ``model_1.predict`` for the one-row frame.
    """
    feature_names = (
        "Branch",
        "Tenure",
        "Salary",
        "Department",
        "JobSatisfaction",
        "WorkLifeBalance",
        "CommuteDistance",
        "MaritalStatus",
        "Education",
        "PerformanceRating",
        "TrainingHours",
        "NumProjects",
        "YearsSincePromotion",
        "EnvironmentSatisfaction",
    )

    # Features start at position 1 of the row; position 0 is skipped.
    record = {name: row[position] for position, name in enumerate(feature_names, start=1)}

    single_row_frame = pd.DataFrame(record, index=[0])
    label = model_1.predict(single_row_frame)[0]
    return f"Prediction: {label}"

In [17]:
# Two hand-written sample rows; the trailing comments keep the labels the
# original cell expected for each of them.
first_sample = pd.Series([1,'San Francisco',4.0,63000.0,'Legal',3.0,3.0,'Long','Married','High School',3.0,88.0,3.0,0.0,2.0])  # Should output 2
second_sample = pd.Series([2,'Chicago',14.0,72000.0,'Accounting',4.0,4.0,'Short','Single','Bachelor',3.6666666666666665,30.0,3.0,2.0,3.0])  # Should output 1

for sample_row in (first_sample, second_sample):
    print(make_prediction(sample_row))

Prediction: 2
Prediction: 1


## Save Model 1

In [18]:
import joblib
from pathlib import Path

# Save Model
model_1_path = '../artifacts/model_1.pkl'
# Create the target directory first so the dump does not fail when
# ../artifacts does not exist yet.
Path(model_1_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model_1, model_1_path)

['../artifacts/model_1.pkl']

# Model 2 (With Feature Engineering)

In [None]:
# Import the training function under an alias: assigning its result to
# `model_2` would otherwise overwrite the imported function, and re-running
# this cell would then call the previously returned object instead of training.
from src.train_models import model_2 as train_model_2

model_2 = train_model_2(preprocessed_df)

In [None]:
def make_prediction_model_2(row):
    """Predict with ``model_2`` for a single positional record.

    NOTE(review): the feature names used here (Gender, Married, LoanAmount,
    Credit_History, ...) and the "Approved"/"Rejected" wording do not appear
    among the columns of ``preprocessed_df`` that ``model_2`` is created from
    in this notebook - this looks carried over from a different dataset.
    Confirm what ``model_2`` actually expects before relying on this function.

    Parameters
    ----------
    row : indexable
        Values are read with ``row[0]`` through ``row[9]``, in the order of
        the feature names listed below.

    Returns
    -------
    str
        ``"Rejected"`` when the first predicted value equals 0, ``"Approved"``
        when it equals 1, and ``"Error"`` for anything else.
    """
    feature_names = (
        "Gender",
        "Married",
        "Dependents",
        "Education",
        "Self_Employed",
        "LoanAmount",
        "Loan_Amount_Term",
        "Credit_History",
        "Property_Area",
        "Income",
    )

    record = {name: row[position] for position, name in enumerate(feature_names)}

    single_row_frame = pd.DataFrame(record, index=[0])
    outcome = model_2.predict(single_row_frame)[0]

    # Guard clauses instead of an if/elif/else ladder.
    if outcome == 0:
        return "Rejected"
    if outcome == 1:
        return "Approved"
    return "Error"

In [None]:
# Hand-written sample rows; the trailing comments keep the values the original
# cell expected for each of them.
model_2_samples = [
    pd.Series(['Male', 'Yes', '1', 'Graduate', 'No', 128.0, 360.0, 1.0, 'Rural', (4583 + 1508)]),  # Should output 1
    pd.Series(['Male', 'No', '0', 'Graduate', 'No', 141.0, 360.0, 1.0, 'Urban', (6000 + 0)]),  # Should output 1
    pd.Series(['Female', 'No', '4', 'Not Graduate', 'Yes', 1000.0, 360.0, 0.0, 'Rural', (200 + 0)]),  # Should output 0
]

for sample_row in model_2_samples:
    print(make_prediction_model_2(sample_row))

## Save Model 2

In [None]:
import joblib
from pathlib import Path

# Save Model
model_2_path = '../artifacts/model_2.pkl'
# Create the target directory first so the dump does not fail when
# ../artifacts does not exist yet.
Path(model_2_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model_2, model_2_path)