In [10]:
import os
import pickle

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [4]:
# Location of the Excel workbook holding the churn dataset.
# Set the CHURN_DATA_PATH environment variable to point at your own copy of
# the workbook; when it is unset (or empty) the original author's local path
# is used, so existing runs behave exactly as before.
excel_file = os.environ.get('CHURN_DATA_PATH') or (
    r'D:\iti\iti recording\_final project\python\PredectiveModel_Date.xlsx'
)

# Load the workbook into a DataFrame (pd.read_excel reads the first sheet by default).
data = pd.read_excel(excel_file)

# Display the first few rows of the DataFrame as the cell's output.
data.head()

Unnamed: 0,Gender,Senior Citizen,Partner,Phone Service,Multiple Lines,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Churn Score,Streaming Movies,Churn Value
0,1,0,0,1,0,0,1,1,0,0,0,86,0,1
1,0,0,0,1,0,1,0,0,0,0,0,67,0,1
2,0,0,0,1,1,1,0,0,1,0,1,86,1,1
3,0,0,1,1,1,1,0,0,1,1,1,84,1,1
4,1,0,0,1,1,1,0,1,1,0,1,89,1,1


In [5]:
# Print the frame's structure: row count, column names, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4835 entries, 0 to 4834
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   Gender             4835 non-null   int64
 1   Senior Citizen     4835 non-null   int64
 2   Partner            4835 non-null   int64
 3   Phone Service      4835 non-null   int64
 4   Multiple Lines     4835 non-null   int64
 5   Internet Service   4835 non-null   int64
 6   Online Security    4835 non-null   int64
 7   Online Backup      4835 non-null   int64
 8   Device Protection  4835 non-null   int64
 9   Tech Support       4835 non-null   int64
 10  Streaming TV       4835 non-null   int64
 11  Churn Score        4835 non-null   int64
 12  Streaming Movies   4835 non-null   int64
 13  Churn Value        4835 non-null   int64
dtypes: int64(14)
memory usage: 529.0 KB


In [6]:
# Count the missing (NaN) entries in each column.
data.isna().sum()

Gender               0
Senior Citizen       0
Partner              0
Phone Service        0
Multiple Lines       0
Internet Service     0
Online Security      0
Online Backup        0
Device Protection    0
Tech Support         0
Streaming TV         0
Churn Score          0
Streaming Movies     0
Churn Value          0
dtype: int64

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
data.describe()

Unnamed: 0,Gender,Senior Citizen,Partner,Phone Service,Multiple Lines,Internet Service,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Churn Score,Streaming Movies,Churn Value
count,4835.0,4835.0,4835.0,4835.0,4835.0,4835.0,4835.0,4835.0,4835.0,4835.0,4835.0,4835.0,4835.0,4835.0
mean,0.501551,0.20393,0.48666,1.0,0.543744,0.640331,0.359049,0.44302,0.437849,0.362978,0.500931,60.803102,0.503413,0.328025
std,0.500049,0.402959,0.499874,0.0,0.498134,0.479953,0.479771,0.496794,0.496174,0.480908,0.500051,21.662571,0.50004,0.469542
min,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0
25%,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.0,0.0,0.0
50%,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,65.0,1.0,0.0
75%,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,77.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,100.0,1.0,1.0


In [8]:
# Features are every column except the last one; the last column is the target.
# NOTE(review): "Churn Score" is among the features - if that score is itself
# derived from churn outcomes this would be target leakage; confirm with the
# data owner.
X = data.iloc[:, :-1].to_numpy()
y = data.iloc[:, -1].to_numpy()

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [9]:
# Feature scaling: the scaler's statistics are learned from the training split
# only and then applied to both splits, so nothing about the test rows
# influences the transformation.
# NOTE: X_train / X_test are overwritten with their scaled versions, so
# re-running this cell without first re-running the split cell would refit
# `sc` on already-scaled data.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [11]:
# Candidate models; each one is fitted on the same scaled training split.
LR_c = LogisticRegression()
# random_state pins the forest's random sampling so the reported metrics are
# reproducible on a fresh kernel (consistent with the seeded split and SVC).
RF_c = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
# Minkowski distance with p = 2 is the Euclidean distance.
KNN_c = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
SVM_c = SVC(kernel = 'rbf', random_state = 0)

# Kept in a list so later cells can evaluate every model with one loop.
classifier = [LR_c,RF_c,KNN_c,SVM_c]
for cl in classifier:
    cl.fit(X_train,y_train)

In [12]:
# Score every fitted model on the held-out split: accuracy first, then the
# confusion matrix, with a blank line separating consecutive models.
for clf in classifier:
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    print(clf, " accuracy is : ", acc)
    print(clf, "confusion matrix-")
    print(cm, end="\n\n")

LogisticRegression()  accuracy is :  0.8751033912324235
LogisticRegression() confusion matrix-
[[720  84]
 [ 67 338]]

RandomForestClassifier(criterion='entropy', n_estimators=10)  accuracy is :  0.8610421836228288
RandomForestClassifier(criterion='entropy', n_estimators=10) confusion matrix-
[[726  78]
 [ 90 315]]

KNeighborsClassifier()  accuracy is :  0.815550041356493
KNeighborsClassifier() confusion matrix-
[[691 113]
 [110 295]]

SVC(random_state=0)  accuracy is :  0.8684863523573201
SVC(random_state=0) confusion matrix-
[[719  85]
 [ 74 331]]



In [None]:
# Persist the fitted logistic-regression model to disk.
# The context manager guarantees the file is flushed and closed even if
# pickling raises, instead of leaving the handle to the garbage collector.
# NOTE: LR_c was trained on features transformed by `sc`; this cell saves only
# the model, so anything loading LR_c.pkl needs the same fitted scaler too.
with open('LR_c.pkl', 'wb') as model_file:
    pickle.dump(LR_c, model_file)

In [36]:
def predication_churn(Gender, SeniorCitizen, Partner, PhoneService, MultipleLines, InternetService, OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTV, ChurnScore, StreamingMovies, *, model=None, scaler=None):
    """Print whether a single customer is predicted to churn.

    The positional arguments are the customer's raw (unscaled) feature values,
    in the same order as the feature columns used for training.

    Args:
        Gender, SeniorCitizen, Partner, PhoneService, MultipleLines,
        InternetService, OnlineSecurity, OnlineBackup, DeviceProtection,
        TechSupport, StreamingTV, StreamingMovies: numeric feature values.
        ChurnScore: numeric churn score feature.
        model: optional fitted estimator exposing ``predict``; defaults to the
            notebook-level ``LR_c``.
        scaler: optional fitted transformer exposing ``transform``; defaults
            to the notebook-level ``sc``.

    Returns:
        None. The verdict is printed to standard output.
    """
    # Fall back to the notebook globals only at call time, so the function can
    # also be used (and tested) with an explicitly supplied model and scaler.
    if model is None:
        model = LR_c
    if scaler is None:
        scaler = sc

    # A single sample must still be 2D: one row, one column per feature.
    features = [[Gender, SeniorCitizen, Partner, PhoneService, MultipleLines, InternetService, OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTV, ChurnScore, StreamingMovies]]

    # Apply the same scaling the model saw during training.
    features_scaled = scaler.transform(features)

    # predict returns one label per input row; read the single row's label
    # explicitly instead of relying on the truth value of a whole array.
    churn = model.predict(features_scaled)

    if churn[0] == 1:
        print("Customer may be churn.")
    else:
        print("Customer may not be churn until now.")

# Example usage: score one hypothetical customer, naming every feature so the
# meaning of each value is visible at the call site.
predication_churn(
    Gender=1,
    SeniorCitizen=0,
    Partner=0,
    PhoneService=1,
    MultipleLines=0,
    InternetService=1,
    OnlineSecurity=0,
    OnlineBackup=1,
    DeviceProtection=0,
    TechSupport=0,
    StreamingTV=1,
    ChurnScore=70,
    StreamingMovies=1,
)


Customer may be churn.
