In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder,OrdinalEncoder,RobustScaler,OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.metrics import confusion_matrix,precision_score,recall_score,accuracy_score,classification_report
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [None]:
# Load the student dataset from the working directory and preview the first rows.
df = pd.read_csv("student.csv")
df.head()


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [None]:
def performance_level(score):
    """Bucket a numeric grade into a performance label.

    Returns "Low" for scores below 10, "Medium" for scores below 15 and
    "High" for everything else.
    """
    # Thresholds are checked in ascending order; the first upper bound that the
    # score falls under decides the label.
    for upper_bound, label in ((10, "Low"), (15, "Medium")):
        if score < upper_bound:
            return label
    return "High"

# Derive the categorical target column from the final grade G3.
df["performance"] = df["G3"].apply(performance_level)


In [None]:
# Integer-encode every text column of df (this includes the "performance" target).
# A fresh LabelEncoder is fitted for each column and stored in `label_encoders`, so
# every column's code -> label mapping stays available through `.classes_` /
# `.inverse_transform`. Refitting one shared encoder would keep only the mapping of
# the last column processed.
# NOTE(review): LabelEncoder is documented as a target encoder; on nominal feature
# columns the integer codes impose an arbitrary order. After this cell df has no
# object columns left, so a later exclude='number' selection on it is likely empty.
label_encoders = {}
le = LabelEncoder()  # keeps `le` defined even when there is nothing to encode
for col in df.select_dtypes(include="object").columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


In [None]:
# Features are every column except the final grade and the label derived from it;
# the target is the performance class.
X = df.drop(columns=["G3", "performance"])
y = df["performance"]

In [None]:
# Hold out a quarter of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)


In [None]:
# Multiclass logistic regression on the three performance classes.
# With the lbfgs solver scikit-learn already fits a multinomial (softmax) model for
# targets with more than two classes, so the deprecated `multi_class` argument is
# not passed (it raises a FutureWarning from scikit-learn 1.5 onwards).
# max_iter is set far above the default — presumably because the features are not
# scaled in this cell and lbfgs needs more iterations to converge.
model = LogisticRegression(solver="lbfgs", max_iter=5000)
model.fit(X_train, y_train)



In [None]:
# Predict performance classes for the held-out test split.
y_pred = model.predict(X_test)

In [None]:
# Report test-split accuracy, the confusion matrix and the per-class
# precision / recall / F1 table for the predictions above.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8888888888888888
Confusion Matrix:
 [[20  0  3]
 [ 0 33  3]
 [ 1  4 35]]
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.87      0.91        23
           1       0.89      0.92      0.90        36
           2       0.85      0.88      0.86        40

    accuracy                           0.89        99
   macro avg       0.90      0.89      0.89        99
weighted avg       0.89      0.89      0.89        99



In [None]:
# Split the feature columns by dtype so each group can get its own transformer.
# NOTE(review): every object column of df was integer-encoded earlier, so cat_col
# is presumably empty at this point — confirm.
num_col = X.select_dtypes(include="number").columns
cat_col = X.select_dtypes(exclude="number").columns

In [None]:
# Column-wise preprocessing: standardise the numeric columns and one-hot encode
# the remaining ones. Categories not seen during fit are ignored at transform time.
preprocessing = ColumnTransformer(
    transformers=[
        ("scaler", StandardScaler(), num_col),
        (
            "onehotencoder",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            cat_col,
        ),
    ]
)

In [None]:
# Decision-tree model on the student data: preprocessing followed by the tree.
# The target is a class label (the encoded performance level), so a classifier is
# used. A regressor would treat the class codes as continuous values, and
# `.score()` would then report R^2 instead of classification accuracy.
# random_state is fixed so the fitted tree, and the scores derived from it, are
# reproducible across runs.
decisiontreepipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        ("model", DecisionTreeClassifier(random_state=42)),
    ]
)
decisiontreepipeline.fit(X_train, y_train)

In [None]:
# Compare the pipeline's score on the training split with the held-out split;
# a large gap between the two points to overfitting.
print("Train_score:",decisiontreepipeline.score(X_train,y_train))
print("Test_score:",decisiontreepipeline.score(X_test,y_test))

Train_score: 1.0
Test_score: 0.4840282447881641


#IMPUTER

#Churn=Yes(1)->Customer left the telecom service
#Churn=No(0)->Customer continued the telecom service

In [None]:
# Load the telco churn dataset. This rebinds `df`, replacing the student data
# loaded earlier in the notebook.
# NOTE(review): "/content/..." is an absolute path (looks like a Colab upload
# location — confirm); the notebook will not run elsewhere without adjusting it.
df=pd.read_csv("/content/telco_data.csv")
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1.0,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34.0,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2.0,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45.0,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2.0,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24.0,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72.0,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11.0,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [None]:
# Summarise dtypes and non-null counts to see which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            6293 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           6043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            4543 non-null   float64
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   6043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       5543 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [None]:
# Remove the customerID column from the frame.
# errors='ignore' makes the cell safe to re-run: once the column is gone, a second
# execution is a no-op instead of raising KeyError.
df = df.drop(columns=['customerID'], errors='ignore')

def clean_charges(val):
  """Convert a raw charge entry to a float.

  The value is turned into a string and stripped of surrounding whitespace.
  An empty result yields None; anything else is handed to float(), which
  raises ValueError for text that is not a number.
  """
  text = str(val).strip()
  return float(text) if text != '' else None

# Convert every TotalCharges entry with clean_charges: numeric text becomes a
# float, blank entries become None (i.e. missing).
df['TotalCharges']=df['TotalCharges'].apply(clean_charges)

In [None]:
# Separate the churn label from the feature columns.
x = df.drop(columns='Churn')
y = df['Churn']

In [None]:
# 80/20 train/test split with a fixed seed so the partition is repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=42,
)

In [None]:
# Column names grouped by dtype: numeric columns vs. everything else.
# (`num_col` is rebound here; it previously referred to the student features.)
num_col = x.select_dtypes(include='number').columns
obj_col = x.select_dtypes(exclude='number').columns

In [None]:
# df['TotalCharges']=df['TotalCharges'].str.strip()
# df['TotalCharges']=df['TotalCharges'].replace('',None)
# df['TotalCharges'].astype({'TotalCharges':'float'})

### SIMPLE IMPUTER:
* It replaces missing values using a strategy (`mean`, `median`, `most_frequent`), or with a constant given by `fill_value` when `strategy='constant'`.
* It accepts only 2D input (e.g. `df[['gender']]`, not `df['gender']`).

In [None]:
# imputer=SimpleImputer(strategy='most_frequent')
# imputer=SimpleImputer(strategy='constant',fill_value='UnKnown')

In [None]:
# df[['gender']]=imputer.fit_transform(df[['gender']])

In [None]:
# df[df['gender'].isna()]
# df[['gender']].value_counts()

In [None]:
# imputer.transform(df[['Partner']])

In [None]:
# Numeric branch: fill missing values with the column mean, then standardise.
num_preprocessing = Pipeline(
    steps=[
        ("imputer_for_numcols", SimpleImputer(strategy="mean")),
        ("standardscaler", StandardScaler()),
    ]
)
# Fit on the numeric training columns and display the transformed array.
num_preprocessing.fit_transform(xtrain[num_col])
# pipeline.fit(obj_col)
# pipeline.transform()

array([[-0.4377492 ,  0.        ,  0.        , -0.42210502],
       [-0.4377492 ,  0.        ,  0.        ,  1.25536015],
       [-0.4377492 ,  0.        ,  0.        , -1.00299144],
       ...,
       [-0.4377492 , -0.93381312, -1.46464784, -0.87799925],
       [ 2.28441306, -0.93381312,  1.15806793, -0.48254445],
       [-0.4377492 , -0.29753207, -1.50986708, -0.81110232]])

In [None]:
# Categorical branch: missing entries are filled with the literal 'Unknown', then
# each category is mapped to an integer code. Categories that were not present
# during fit are encoded as -1 at transform time.
cat_preprocessing = Pipeline(
    steps=[
        (
            "imputer_for_objcols",
            SimpleImputer(strategy="constant", fill_value="Unknown"),
        ),
        (
            "ordinalencoder",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
        ),
    ]
)
# Fit on the non-numeric training columns and display the encoded array.
cat_preprocessing.fit_transform(xtrain[obj_col])

In [None]:
# Route each column group through its own branch. This rebinds `preprocessing`,
# which earlier held the student-data transformer.
preprocessing = ColumnTransformer(
    transformers=[
        ("num_preprocessing", num_preprocessing, num_col),
        ("cat_preprocessing", cat_preprocessing, obj_col),
    ]
)
# Fit on the training features and display the combined output array.
preprocessing.fit_transform(xtrain)


array([[-0.4377492 ,  0.        ,  0.        , ...,  1.        ,
         0.        ,  3.        ],
       [-0.4377492 ,  0.        ,  0.        , ...,  2.        ,
         1.        ,  0.        ],
       [-0.4377492 ,  0.        ,  0.        , ...,  0.        ,
         1.        ,  2.        ],
       ...,
       [-0.4377492 , -0.93381312, -1.46464784, ...,  0.        ,
         1.        ,  2.        ],
       [ 2.28441306, -0.93381312,  1.15806793, ...,  0.        ,
         1.        ,  2.        ],
       [-0.4377492 , -0.29753207, -1.50986708, ...,  1.        ,
         0.        ,  1.        ]])

In [None]:
# Baseline churn classifier: the preprocessing step followed by a logistic
# regression left at its default settings.
pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        ("model", LogisticRegression()),
    ]
)
pipeline.fit(xtrain, ytrain)
# Display the fitted classifier step.
pipeline.named_steps["model"]

In [None]:
# Decision-tree classifier on the churn data, using the same `preprocessing`
# ColumnTransformer as the logistic-regression pipeline. This rebinds
# `decisiontreepipeline`, which earlier held the student-data model.
# random_state is fixed so the fitted tree, and therefore the train/test scores
# reported from it, are reproducible from run to run.
decisiontreepipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        ("model", DecisionTreeClassifier(random_state=42)),
    ]
)
decisiontreepipeline.fit(xtrain, ytrain)

In [None]:
# Compare the tree pipeline's score on the training split with the held-out
# split; a large gap between the two points to overfitting.
print("Train_score:",decisiontreepipeline.score(xtrain,ytrain))
print("Test_score:",decisiontreepipeline.score(xtest,ytest))

Train_score: 0.998757543485978
Test_score: 0.6941092973740242


In [None]:
# Cross-validated hyper-parameter search over the logistic-regression pipeline.
# The grid varies the regularisation strength C and the penalty type (4 x 2 = 8
# candidates); solver and class_weight are each pinned to a single value.
# Candidates are ranked by macro-averaged F1 over 10 folds, using all CPU cores.
grid_search_cv = GridSearchCV(
    estimator=pipeline,
    param_grid={
        "model__C": [0.01, 0.1, 1.0, 10],
        "model__penalty": ["l1", "l2"],
        "model__solver": ["liblinear"],
        "model__class_weight": ["balanced"],
    },
    scoring="f1_macro",
    cv=10,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the cross-validated search on the training split.
grid_search_cv.fit(xtrain,ytrain)

Fitting 10 folds for each of 8 candidates, totalling 80 fits


In [None]:
# Score the search object on the training split.
# NOTE(review): this is measured on the data the model was fitted on, and is
# presumably reported in the search's `scoring` metric (f1_macro) — confirm.
grid_search_cv.score(xtrain,ytrain)

0.7120140852180937

In [None]:
# Parameter combination that ranked first in the search.
grid_search_cv.best_params_

{'model__C': 0.1,
 'model__class_weight': 'balanced',
 'model__penalty': 'l2',
 'model__solver': 'liblinear'}

In [None]:
# Take the best pipeline found by the search. This rebinds `model`, which earlier
# held the student-data LogisticRegression.
model=grid_search_cv.best_estimator_

In [None]:
# Fit the selected pipeline on the training split.
# NOTE(review): with GridSearchCV's default refit=True, best_estimator_ should
# already be fitted on the full training data, so this call looks redundant — confirm.
model.fit(xtrain,ytrain)

In [None]:
# Score the selected pipeline on the training split.
# NOTE(review): this presumably uses the classifier's default metric rather than the
# f1_macro used by the search, so it is not directly comparable with
# grid_search_cv.score / best_score_ — confirm.
model.score(xtrain,ytrain)

0.7412140575079872

In [None]:
# Mean cross-validated score of the best parameter combination.
grid_search_cv.best_score_

np.float64(0.7088434266097943)

In [None]:
# Collect the per-candidate search results into a DataFrame for inspection.
cv_result=pd.DataFrame(grid_search_cv.cv_results_)

In [None]:
# Display the results table (one row per parameter combination).
cv_result

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__C,param_model__class_weight,param_model__penalty,param_model__solver,params,split0_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.088058,0.005378,0.027888,0.007118,0.01,balanced,l1,liblinear,"{'model__C': 0.01, 'model__class_weight': 'bal...",0.721296,...,0.67392,0.700266,0.71725,0.696636,0.676754,0.727019,0.692522,0.698923,0.018514,8
1,0.087155,0.00415,0.024671,0.001292,0.01,balanced,l2,liblinear,"{'model__C': 0.01, 'model__class_weight': 'bal...",0.72851,...,0.711996,0.70347,0.713267,0.713377,0.679713,0.732067,0.687789,0.70561,0.016436,7
2,0.13636,0.010823,0.025384,0.002943,0.1,balanced,l1,liblinear,"{'model__C': 0.1, 'model__class_weight': 'bala...",0.730555,...,0.707938,0.708102,0.70682,0.71109,0.684627,0.727019,0.690179,0.705749,0.014723,6
3,0.090849,0.003962,0.024782,0.002442,0.1,balanced,l2,liblinear,"{'model__C': 0.1, 'model__class_weight': 'bala...",0.73285,...,0.71438,0.704887,0.710897,0.716613,0.683812,0.722244,0.698934,0.708843,0.013001,1
4,0.152899,0.010973,0.025392,0.00438,1.0,balanced,l1,liblinear,"{'model__C': 1.0, 'model__class_weight': 'bala...",0.727291,...,0.710301,0.712704,0.706067,0.71432,0.686139,0.717366,0.700532,0.707285,0.012902,3
5,0.097615,0.008393,0.023249,0.000678,1.0,balanced,l2,liblinear,"{'model__C': 1.0, 'model__class_weight': 'bala...",0.728922,...,0.710301,0.712704,0.706067,0.71432,0.68456,0.717366,0.700532,0.707448,0.013157,2
6,0.153339,0.009399,0.0258,0.004043,10.0,balanced,l1,liblinear,"{'model__C': 10, 'model__class_weight': 'balan...",0.728922,...,0.710301,0.712704,0.706067,0.716613,0.68456,0.715746,0.698934,0.70704,0.013814,4
7,0.090846,0.002664,0.022307,0.0026,10.0,balanced,l2,liblinear,"{'model__C': 10, 'model__class_weight': 'balan...",0.728922,...,0.710301,0.712704,0.706067,0.716613,0.68456,0.715746,0.698934,0.70704,0.013814,4
