# Case Study 1 : Social Network Product Ads

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
df = pd.read_csv('gdrive/MyDrive/Datasets/Social_Network_Ads.csv')
df

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


The dataset has:
- Id column
- Gender: Categorical column (Male/Female)
- Numerical features: Age and Estimated Salary

In [None]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
User ID,400.0,15691540.0,71658.321581,15566689.0,15626763.75,15694341.5,15750363.0,15815236.0
Age,400.0,37.655,10.482877,18.0,29.75,37.0,46.0,60.0
EstimatedSalary,400.0,69742.5,34096.960282,15000.0,43000.0,70000.0,88000.0,150000.0
Purchased,400.0,0.3575,0.479864,0.0,0.0,0.0,1.0,1.0


- There are 400 people analyzed.
- The age goes from 18 to 60 years.
- The salary goes from 15000 USD to 150000 USD per year.

In [None]:
pd.isnull(df).sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

There are no missing values

In [None]:
label = ['Not Purchased', 'Purchased']

fig = make_subplots(rows=1, cols=3, specs=[[{'type':'domain'}, {'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=['Female', 'Male'], values=df['Gender'].value_counts(), name="Gender"), 1, 1)
fig.add_trace(go.Pie(labels=label, values=df['Purchased'][df['Gender'] == 'Male'].value_counts(), name="Male"), 1, 2)
fig.add_trace(go.Pie(labels=label, values=df['Purchased'][df['Gender'] == 'Female'].value_counts(), name="Female"), 1, 3)

fig.update_traces(hole=.4, hoverinfo="label+value+name+percent", textfont_size=16)

fig.update_layout(
    title_text="Gender Distribution",
    annotations=[dict(text='Overall', x=0.123, y=0.5, font_size=15, showarrow=False),
                 dict(text='Male', x=0.5, y=0.5, font_size=15, showarrow=False),
                 dict(text='Female', x=0.895, y=0.5, font_size=15, showarrow=False)])
fig.update_layout(
    font_family="Times New Roman",
    title_font_family="Times New Roman",
    font_size=30
)
fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=0.9,
    font_size=30
))
fig.show()

- There are 204 Female Users (51%) and 196 Male Users (49%)
- Out of the total Male, 66.3% (130) didn't purchase the product and 33.7% (66) purchase the product
- Out of the total Female, 62.3% (127) didn't purchase the product and 37.7% (77) purchase the product

This concludes that Female purchases more product whose Ads are shown on Social Media Applications

# Case Study 2 : Telecommunication Churn Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
telcom = pd.read_csv('gdrive/MyDrive/Datasets/telcom.csv')
telcom

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.70,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.70,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.30,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.90,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,AZ,192,415,414-4276,no,yes,36,156.2,77,26.55,...,126,18.32,279.1,83,12.56,9.9,6,2.67,2,False
3329,WV,68,415,370-3271,no,no,0,231.1,57,39.29,...,55,13.04,191.3,123,8.61,9.6,4,2.59,3,False
3330,RI,28,510,328-8230,no,no,0,180.8,109,30.74,...,58,24.55,191.9,91,8.64,14.1,6,3.81,2,False
3331,CT,184,510,364-6381,yes,no,0,213.8,105,36.35,...,84,13.57,139.2,137,6.26,5.0,10,1.35,2,False


In [None]:
telcom.shape

(3333, 21)

**Removal of unneccessary data and Null Values**

In [None]:
# Removing unneccessary data
telcom = telcom.drop(['account length', 'area code', 'phone number', 'customer service calls'], axis=1)
telcom.shape

(3333, 17)

In [None]:
# Checking Null Values
telcom.isnull().sum()

state                    0
international plan       0
voice mail plan          0
number vmail messages    0
total day minutes        0
total day calls          0
total day charge         0
total eve minutes        0
total eve calls          0
total eve charge         0
total night minutes      0
total night calls        0
total night charge       0
total intl minutes       0
total intl calls         0
total intl charge        0
churn                    0
dtype: int64

In [None]:
churn_true = telcom['state'][telcom['churn'] == True]
churn_false = telcom['state'][telcom['churn'] == False]

fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=churn_true, values=telcom['state'][telcom['churn'] == True].value_counts(), name="Churned"), 1, 1)
fig.add_trace(go.Pie(labels=churn_false, values=telcom['state'][telcom['churn'] == False].value_counts(), name="Non Churned"), 1, 2)

fig.update_traces(hole=.4, hoverinfo="label+percent+name", textfont_size=16)

fig.update_layout(
    title_text="State Wise Churn Distribution",
    annotations=[dict(text='Churned', x=0.18, y=0.5, font_size=20, showarrow=False),
                 dict(text='Non Churned', x=0.84, y=0.5, font_size=20, showarrow=False)])
fig.show()

In [None]:
international_labels = ['No International Plan', 'International Plan']
voice_labels = ['No Voice Plan', 'Voice Plan']
churn_labels = ['False', 'True']

fig = make_subplots(rows=1, cols=3, specs=[[{'type':'domain'}, {'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=international_labels, values=telcom['international plan'].value_counts(), name="International Plan"), 1, 1)
fig.add_trace(go.Pie(labels=voice_labels, values=telcom['voice mail plan'].value_counts(), name="Voice Plan"), 1, 2)
fig.add_trace(go.Pie(labels=churn_labels, values=telcom['churn'].value_counts(), name="Churn"), 1, 3)

fig.update_traces(hole=.4, hoverinfo="label+percent+name", textfont_size=16)

fig.update_layout(
    title_text="International Plan, Voice Mail Plan and Churn Distributions",
    annotations=[dict(text='International Plan', x=0.05, y=0.5, font_size=20, showarrow=False),
                 dict(text='Voice Plan', x=0.5, y=0.5, font_size=20, showarrow=False),
                 dict(text='Churn', x=0.88, y=0.5, font_size=20, showarrow=False)])
fig.show()

In [None]:
telcom = telcom.drop(['state'], axis=1)
telcom.shape

(3333, 16)

**Outlier Detection and Removal**

In [None]:
fig = px.box(telcom, x='churn', y = 'total day charge')

fig.update_yaxes(title_text='Total day charge', row=1, col=1)

fig.update_xaxes(title_text='churn', row=1, col=1)

fig.update_layout(autosize=True, width=750, height=600,
    title_font=dict(size=25, family='Courier'),
    title='<b>total day charge vs Churn</b>',
)

fig.show()

In [None]:
# Removing the outliers
telcom = telcom[telcom['total day charge'] >= 7.63]
telcom = telcom[telcom['total day charge'] <= 52.77]
telcom.shape

(3285, 16)

In [None]:
fig = px.box(telcom, x='churn', y = 'total day charge')

fig.update_yaxes(title_text='Total day charge', row=1, col=1)

fig.update_xaxes(title_text='churn', row=1, col=1)

fig.update_layout(autosize=True, width=750, height=600,
    title_font=dict(size=25, family='Courier'),
    title='<b>total day charge vs Churn</b>',
)

fig.show()

**Encoding**

In [None]:
# Encoding
telcom['international plan'] = telcom['international plan'].map({'yes':1 ,'no':0})
telcom['voice mail plan'] = telcom['voice mail plan'].map({'yes':1 ,'no':0})
telcom

Unnamed: 0,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,churn
0,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.70,False
1,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.70,False
2,0,0,0,243.4,114,41.38,121.2,110,10.30,162.6,104,7.32,12.2,5,3.29,False
3,1,0,0,299.4,71,50.90,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,False
4,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,0,1,36,156.2,77,26.55,215.5,126,18.32,279.1,83,12.56,9.9,6,2.67,False
3329,0,0,0,231.1,57,39.29,153.4,55,13.04,191.3,123,8.61,9.6,4,2.59,False
3330,0,0,0,180.8,109,30.74,288.8,58,24.55,191.9,91,8.64,14.1,6,3.81,False
3331,1,0,0,213.8,105,36.35,159.6,84,13.57,139.2,137,6.26,5.0,10,1.35,False


**Correlation Analysis**

In [None]:
telcom.corr(method ='pearson')

Unnamed: 0,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,churn
international plan,1.0,0.007891,0.010525,0.048065,0.006911,0.048067,0.018464,0.007967,0.018469,-0.030292,0.01408,-0.030296,0.045437,0.013993,0.04534,0.266238
voice mail plan,0.007891,1.0,0.956767,0.009274,-0.019016,0.009274,0.023841,-0.006231,0.023854,0.002486,0.013525,0.002472,0.002257,0.007704,0.002316,-0.094037
number vmail messages,0.010525,0.956767,1.0,0.011349,-0.016655,0.011347,0.01975,-0.006248,0.019767,0.003886,0.00525,0.003868,0.006584,0.013722,0.006627,-0.081925
total day minutes,0.048065,0.009274,0.011349,1.0,0.012036,1.0,-0.000737,0.013333,-0.000748,0.000656,0.011394,0.00063,-0.013804,-0.001664,-0.013803,0.165206
total day calls,0.006911,-0.019016,-0.016655,0.012036,1.0,0.012042,-0.019376,0.008906,-0.019372,0.021296,-0.023706,0.021285,0.021434,0.008239,0.021539,0.031386
total day charge,0.048067,0.009274,0.011347,1.0,0.012042,1.0,-0.000729,0.013334,-0.000741,0.000657,0.011394,0.000631,-0.013807,-0.001666,-0.013806,0.165205
total eve minutes,0.018464,0.023841,0.01975,-0.000737,-0.019376,-0.000729,1.0,-0.010377,1.0,-0.00978,0.00339,-0.009786,-0.011845,-0.001322,-0.011888,0.092199
total eve calls,0.007967,-0.006231,-0.006248,0.013333,0.008906,0.013334,-0.010377,1.0,-0.010372,-0.002638,0.007153,-0.002606,0.010241,0.017567,0.010214,0.003433
total eve charge,0.018469,0.023854,0.019767,-0.000748,-0.019372,-0.000741,1.0,-0.010372,1.0,-0.009788,0.003401,-0.009795,-0.011855,-0.001321,-0.011897,0.09219
total night minutes,-0.030292,0.002486,0.003886,0.000656,0.021296,0.000657,-0.00978,-0.002638,-0.009788,1.0,0.010089,0.999999,-0.014262,-0.014839,-0.014222,0.034665


In [None]:
# Dropping Highly Correlated columns
telcom = telcom.drop(['total day minutes', 'total eve minutes', 'total night minutes', 'total intl minutes'], axis=1)
telcom

Unnamed: 0,international plan,voice mail plan,number vmail messages,total day calls,total day charge,total eve calls,total eve charge,total night calls,total night charge,total intl calls,total intl charge,churn
0,0,1,25,110,45.07,99,16.78,91,11.01,3,2.70,False
1,0,1,26,123,27.47,103,16.62,103,11.45,3,3.70,False
2,0,0,0,114,41.38,110,10.30,104,7.32,5,3.29,False
3,1,0,0,71,50.90,88,5.26,89,8.86,7,1.78,False
4,1,0,0,113,28.34,122,12.61,121,8.41,3,2.73,False
...,...,...,...,...,...,...,...,...,...,...,...,...
3328,0,1,36,77,26.55,126,18.32,83,12.56,6,2.67,False
3329,0,0,0,57,39.29,55,13.04,123,8.61,4,2.59,False
3330,0,0,0,109,30.74,58,24.55,91,8.64,6,3.81,False
3331,1,0,0,105,36.35,84,13.57,137,6.26,10,1.35,False


#Case Study 3 : Implementing ML Models and testing their performance

In [None]:
import pandas as pd
import numpy as np

import plotly.express as px

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

**Performance Visualization**

In [None]:
def visualization(result_df):
  fig = px.bar(result_df,
             x='Model',
             y='Accuracy')
  fig.update_layout(
      font_family="Times New Roman",
      title_font_family="Times New Roman",
      legend_title='',
      title = "Accuracy of Models"
  )
  fig.update_xaxes(title_font_family="Times New Roman", title='Models')
  fig.update_yaxes(title_font_family="Times New Roman", title='Accuracy')


  fig.show()

**Model Evaluation Function**

In [None]:
from sklearn import metrics

def evaluate(name, y_actual, y_predicted):
    Model_name.append(name)
    Accuracy.append(100*metrics.accuracy_score(y_actual, y_predicted))

**KNN**

In [37]:
def knnmodel(X, Y):
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=None, shuffle=True)
  clf = KNeighborsClassifier(n_neighbors=2)
  clf.fit(X_train,Y_train)

  Y_pred = clf.predict(X)

  evaluate('KNN', Y, Y_pred)

**Naive Bayes**

In [38]:
def naivebyesmodel(X, Y):
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=None, shuffle=True)
  clf = GaussianNB()
  clf.fit(X_train,Y_train)

  Y_pred = clf.predict(X)

  evaluate('Naive Byes', Y, Y_pred)

**Decision Tree**

In [39]:
def decisiontreemodel(X, Y):
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=None, shuffle=True)

  clf = DecisionTreeClassifier()

  # Train Decision Tree Classifer
  clf = clf.fit(X_train,Y_train)

  #Predict the response for test dataset
  Y_pred = clf.predict(X)

  evaluate('Decision Tree', Y, Y_pred)

**Random Forest**

In [40]:
from sklearn.ensemble import RandomForestClassifier

def randomforest(X, Y):
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=None, shuffle=True)

  clf = RandomForestClassifier()
  clf.fit(X_train, Y_train)

  Y_pred = clf.predict(X)

  evaluate('Random Forest ', Y, Y_pred)

**Dataset 1 - Social Network Ads**

In [None]:
df['Gender'] = df['Gender'].map({'Male':int(1) ,'Female':int(0)})
x = df[['Gender', 'Age', 'EstimatedSalary']]
y = df.Purchased

In [41]:
Model_name = []
Accuracy = []

naivebyesmodel(x, y)
knnmodel(x, y)
decisiontreemodel(x, y)
randomforest(x, y)

di = {'Model': Model_name, 'Accuracy': Accuracy}
result_df = pd.DataFrame(di)
print(result_df)

visualization(result_df)

            Model  Accuracy
0      Naive Byes     89.25
1             KNN     86.75
2   Decision Tree     95.50
3  Random Forest      96.75


**Dataset 2 - Telecommunication Churn**

In [43]:
x = telcom[['international plan', 'voice mail plan', 'total day charge', 'total eve charge', 'total night charge', 'total intl charge']]
y = telcom.churn

In [42]:
Model_name = []
Accuracy = []

naivebyesmodel(x, y)
knnmodel(x, y)
decisiontreemodel(x, y)
randomforest(x, y)

di = {'Model': Model_name, 'Accuracy': Accuracy}
result_df = pd.DataFrame(di)
print(result_df)

visualization(result_df)

            Model  Accuracy
0      Naive Byes     89.75
1             KNN     84.50
2   Decision Tree     96.00
3  Random Forest      96.25
