## data loading and prepration

In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [3]:
df1 = pd.read_csv('/content/telecom_usage.csv')
df2 = pd.read_csv('/content/telecom_demographics.csv')

In [4]:
churn_df = pd.merge(df1, df2, on='customer_id')

In [5]:
churn_df.head()

Unnamed: 0,customer_id,calls_made,sms_sent,data_used,churn,telecom_partner,gender,age,state,city,pincode,registration_event,num_dependents,estimated_salary
0,15169,75,21,4532,1,Airtel,F,26,Himachal Pradesh,Delhi,667173,2020-03-16,4,85979
1,149207,35,38,723,1,Airtel,F,74,Uttarakhand,Hyderabad,313997,2022-01-16,0,69445
2,148119,70,47,4688,1,Airtel,F,54,Jharkhand,Chennai,549925,2022-01-11,2,75949
3,187288,95,32,10241,1,Reliance Jio,M,29,Bihar,Hyderabad,230636,2022-07-26,3,34272
4,14016,66,23,5246,1,Vodafone,M,45,Nagaland,Bangalore,188036,2020-03-11,4,34157


## applying scaling through column transform

In [6]:
churn_df.columns

Index(['customer_id', 'calls_made', 'sms_sent', 'data_used', 'churn',
       'telecom_partner', 'gender', 'age', 'state', 'city', 'pincode',
       'registration_event', 'num_dependents', 'estimated_salary'],
      dtype='object')

In [7]:
# Define categorical and numerical columns
categorical_features = ['telecom_partner', 'gender', 'state', 'city', 'registration_event']
numerical_features = ['calls_made', 'sms_sent', 'data_used', 'age', 'pincode', 'num_dependents', 'estimated_salary']

# Create column transformer with OneHotEncoder for categorical features and StandardScaler for numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

In [9]:
# Apply the transformations
features = churn_df.drop('churn', axis=1)
target = churn_df['churn']

features_scaled = preprocessor.fit_transform(features)

In [10]:
# Display the scaled features
print(features_scaled)

  (0, 0)	0.846075853060938
  (0, 1)	-0.22238450720259403
  (0, 2)	-0.15948802391326633
  (0, 3)	-1.2229697881162898
  (0, 4)	0.45493602916447207
  (0, 5)	1.4365388699639015
  (0, 6)	0.011981184225627477
  (0, 7)	1.0
  (0, 11)	1.0
  (0, 21)	1.0
  (0, 43)	1.0
  (0, 122)	1.0
  (1, 0)	-0.49634439857967455
  (1, 1)	0.938055993642072
  (1, 2)	-1.4548963020853525
  (1, 3)	1.6963038021724683
  (1, 4)	-0.9041947281849755
  (1, 5)	-1.411346041897436
  (1, 6)	-0.428423410129078
  (1, 7)	1.0
  (1, 11)	1.0
  (1, 39)	1.0
  (1, 44)	1.0
  (1, 790)	1.0
  (2, 0)	0.6782733216058614
  :	:
  (6497, 249)	1.0
  (6498, 0)	0.07418420836758581
  (6498, 1)	-1.1097801843191033
  (6498, 2)	0.6237425522564145
  (6498, 3)	-1.2229697881162898
  (6498, 4)	0.8973146649731012
  (6498, 5)	0.7245676419985672
  (6498, 6)	0.9117021463710999
  (6498, 10)	1.0
  (6498, 12)	1.0
  (6498, 28)	1.0
  (6498, 42)	1.0
  (6498, 372)	1.0
  (6499, 0)	1.3494834474261677
  (6499, 1)	-0.8367353605909467
  (6499, 2)	0.1802633202437077
  (649

In [11]:
# Verify the shape
print("Shape of Scaled Features:", features_scaled.shape)
print("Shape of Target:", target.shape)

Shape of Scaled Features: (6500, 1263)
Shape of Target: (6500,)


## spliting data

In [12]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets with an 80-20 split
X_train, X_test, y_train, y_test = train_test_split(features_scaled, target, test_size=0.2, random_state=42)

# Display the shapes of the resulting datasets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (5200, 1263)
Shape of X_test: (1300, 1263)
Shape of y_train: (5200,)
Shape of y_test: (1300,)


## both models

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Train Logistic Regression model
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train, y_train)
logreg_pred = logreg.predict(X_test)

# Train Random Forest Classifier model
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# Assess the models on test data
logreg_accuracy = accuracy_score(y_test, logreg_pred)
rf_accuracy = accuracy_score(y_test, rf_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
print(f"Logistic Regression Accuracy: {logreg_accuracy}")
print(f"Random Forest Accuracy: {rf_accuracy}")

Logistic Regression Accuracy: 0.7838461538461539
Random Forest Accuracy: 0.7876923076923077


## evaluating which one shows higher accuracy

In [15]:
if logreg_accuracy > rf_accuracy:
    higher_accuracy = "LogisticRegression"
else:
    higher_accuracy = "RandomForest"
print(f"Higher Accuracy Model: {higher_accuracy}")

Higher Accuracy Model: RandomForest
