# Project Goal:

**Develop a classification model to recommend whether a Megaline user should switch to the "Smart" or "Ultra" plan based on their monthly usage behavior. The target accuracy is ≥ 0.75.**

In [13]:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.dummy import DummyClassifier

**Load the dataset**

In [14]:
# Location of the user-behaviour CSV, named once so it is easy to repoint.
DATA_PATH = '/datasets/users_behavior.csv'
df = pd.read_csv(DATA_PATH)

**Data checks**

In [15]:
# Preview the first 5 rows
print(df.head())
# Show dtypes and non-null counts. DataFrame.info() writes its report itself and
# returns None, so it must not be wrapped in print() (that emits a stray "None").
df.info()
# Summary statistics for the numeric columns
print(df.describe())

   calls  minutes  messages   mb_used  is_ultra
0   40.0   311.90      83.0  19915.42         0
1   85.0   516.75      56.0  22696.96         0
2   77.0   467.66      86.0  21060.45         0
3  106.0   745.53      81.0   8437.39         1
4   66.0   418.74       1.0  14502.75         0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3214 entries, 0 to 3213
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   calls     3214 non-null   float64
 1   minutes   3214 non-null   float64
 2   messages  3214 non-null   float64
 3   mb_used   3214 non-null   float64
 4   is_ultra  3214 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 125.7 KB
None
             calls      minutes     messages       mb_used     is_ultra
count  3214.000000  3214.000000  3214.000000   3214.000000  3214.000000
mean     63.038892   438.208787    38.281269  17207.673836     0.306472
std      33.236368   234.569872    36.148326   7570.968246  

**Split the dataset:**

In [16]:
# Two-stage split into 60% train / 20% validation / 20% test:
# first set aside 40% of the rows, then cut that hold-out in half.
# NOTE(review): neither split is stratified on is_ultra — consider whether it should be.
df_train, df_left = train_test_split(
    df,
    test_size=0.40,
    random_state=12345,
)
df_valid, df_test = train_test_split(
    df_left,
    test_size=0.5,
    random_state=12345,
)

In [17]:
# For each subset: model inputs are all columns except the label, target is the label column
features_train, target_train = df_train.drop(columns='is_ultra'), df_train['is_ultra']
features_valid, target_valid = df_valid.drop(columns='is_ultra'), df_valid['is_ultra']
features_test, target_test = df_test.drop(columns='is_ultra'), df_test['is_ultra']

**Train and evaluate each model over a range of hyperparameter values (`max_depth` for the tree-based models, `C` for logistic regression)**

In [18]:
# Decision Tree: sweep max_depth over 1..10 and keep the tree that scores the
# highest accuracy on the validation set.
dt_best_result, dt_best_model = 0, None
for depth in range(1, 11):
    candidate = DecisionTreeClassifier(
        max_depth=depth,
        class_weight='balanced',
        random_state=12345,
    ).fit(features_train, target_train)
    candidate_accuracy = accuracy_score(target_valid, candidate.predict(features_valid))
    # Only a strictly better score replaces the incumbent, so ties keep the shallower tree
    if candidate_accuracy <= dt_best_result:
        continue
    dt_best_result, dt_best_model = candidate_accuracy, candidate
print(f"Accuracy of the Decision Tree best model: {dt_best_result:.4f}")

Accuracy of the Decision Tree best model: 0.7838


In [19]:
# Random Forest: sweep max_depth over 1..10 and keep the forest that scores the
# highest accuracy on the validation set.
rf_best_result, rf_best_model = 0, None
for depth in range(1, 11):
    candidate = RandomForestClassifier(
        max_depth=depth,
        class_weight='balanced',
        random_state=12345,
    ).fit(features_train, target_train)
    candidate_accuracy = accuracy_score(target_valid, candidate.predict(features_valid))
    # Only a strictly better score replaces the incumbent, so ties keep the shallower forest
    if candidate_accuracy <= rf_best_result:
        continue
    rf_best_result, rf_best_model = candidate_accuracy, candidate
print(f"Accuracy of the Random Forest best model: {rf_best_result:.4f}")

Accuracy of the Random Forest best model: 0.7885


In [20]:
# Logistic Regression: sweep the inverse regularisation strength C and keep the
# model that scores the highest accuracy on the validation set.
lr_best_result = 0
best_c = None
lr_best_model = None
for c in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(C=c, max_iter=1000, class_weight='balanced', random_state=12345)
    model.fit(features_train, target_train)
    predictions = model.predict(features_valid)
    lr_accuracy = accuracy_score(target_valid, predictions)
    # Strict ">" keeps the smallest C when several values tie
    if lr_accuracy > lr_best_result:
        lr_best_result = lr_accuracy
        lr_best_model = model
        # Record the winning C so the chosen hyperparameter can be reported
        best_c = c
print(f"Accuracy of the Logistic Regression model: {lr_best_result:.4f}")
print(f"Best C for Logistic Regression: {best_c}")

Accuracy of the Logistic Regression model: 0.6174


**Investigate the quality of the different models and select the best model based on validation accuracy**

In [21]:
# Pick the winner by validation accuracy. Candidates are listed in tie-break
# priority order (Random Forest, then Decision Tree, then Logistic Regression):
# max() returns the first entry among equals, which reproduces that preference.
candidates = [
    (rf_best_result, rf_best_model, "Selected Model: Random Forest"),
    (dt_best_result, dt_best_model, "Selected Model: Decision Tree"),
    (lr_best_result, lr_best_model, "Selected Model: Logistic Regression"),
]
winner = max(candidates, key=lambda entry: entry[0])
final_model = winner[1]
print(winner[2])

Selected Model: Random Forest


**Evaluate the selected best model on the test set**

In [22]:
# Evaluate the selected model on the held-out test set
final_test_pred = final_model.predict(features_test)
print("Test Accuracy:", accuracy_score(target_test, final_test_pred))
print("Classification Report:")
print(classification_report(target_test, final_test_pred))
print("Confusion Matrix:")
print(confusion_matrix(target_test, final_test_pred))


# Baseline for comparison: always predict the class that is most frequent in the
# training set. DummyClassifier is already imported in the notebook's import cell,
# so it is not re-imported here.
dummy = DummyClassifier(strategy="most_frequent", random_state=12345)
dummy.fit(features_train, target_train)
baseline_accuracy = dummy.score(features_test, target_test)
print("Baseline accuracy (most frequent strategy):", baseline_accuracy)

Test Accuracy: 0.7962674961119751
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.90      0.86       440
           1       0.72      0.58      0.64       203

    accuracy                           0.80       643
   macro avg       0.77      0.74      0.75       643
weighted avg       0.79      0.80      0.79       643

Confusion Matrix:
[[394  46]
 [ 85 118]]
Baseline accuracy (most frequent strategy): 0.6842923794712286


# Conclusion:

**The final Random Forest model meets the project requirement of ≥ 0.75 accuracy, reaching about 0.80 on the test set.
It outperforms the most-frequent-class baseline (about 0.68 on the same test set), and class imbalance was handled with `class_weight='balanced'`.
The modeling pipeline includes proper data splitting, hyperparameter tuning, model comparison, and evaluation, ensuring a strong and justifiable solution.**