# Project Megaline Customers for Plan Upgrades

### The goal of our project is to use customer plan information from Smart and Ultimate customers who have switched plans. Then various data elements describing their usage behaviors and apply them to predict which plans may suit them best.

In [51]:
#!pip install scikit-learn
#!pip install joblib

In [52]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [53]:
url = 'https://practicum-content.s3.us-west-1.amazonaws.com/datasets/users_behavior.csv'
df = pd.read_csv(url)

In [54]:
display(df.info())  
display(df.describe()) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3214 entries, 0 to 3213
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   calls     3214 non-null   float64
 1   minutes   3214 non-null   float64
 2   messages  3214 non-null   float64
 3   mb_used   3214 non-null   float64
 4   is_ultra  3214 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 125.7 KB


None

Unnamed: 0,calls,minutes,messages,mb_used,is_ultra
count,3214.0,3214.0,3214.0,3214.0,3214.0
mean,63.038892,438.208787,38.281269,17207.673836,0.306472
std,33.236368,234.569872,36.148326,7570.968246,0.4611
min,0.0,0.0,0.0,0.0,0.0
25%,40.0,274.575,9.0,12491.9025,0.0
50%,62.0,430.6,30.0,16943.235,0.0
75%,82.0,571.9275,57.0,21424.7,1.0
max,244.0,1632.06,224.0,49745.73,1.0


In [55]:
# check all min values are not 0
zero_count = df[(df[['calls', 'minutes', 'messages', 'mb_used']] == 0).all(axis=1)].shape[0]

print("Number of rows where all columns have a value of zero:", zero_count)

Number of rows where all columns have a value of zero: 0


In [56]:
# count for any duplicates
num_duplicates = df.duplicated().sum()
# print number of duplicate rows 
print(f"Number of duplicate rows: {num_duplicates}")

Number of duplicate rows: 0


### All data conforms to accepted stadards and no duplicates values were found. 

In [57]:
# split the data into features and target 
features = df[['calls', 'minutes', 'messages', 'mb_used']]
target = df['is_ultra']

In [58]:
# split the data into training (60%), validation (20%), and test (20%) sets
features_train, features_temp, target_train, target_temp = train_test_split(features, target, test_size=0.4)
features_valid, features_test, target_valid, target_test = train_test_split(features_temp, target_temp, test_size=0.5)

print("Training set size:", features_train.shape)
print("Validation set size:", features_valid.shape)
print("Test set size:", features_test.shape)

Training set size: (1928, 4)
Validation set size: (643, 4)
Test set size: (643, 4)


### The ideal ratio when splitting source data into three parts is 3:1:1 for training, validation, and testing. This leaves us with 1928 records for training and 643 records for validation and testing.

## Decision Tree Regression

In [59]:
# initialize decision tree model
best_model_tree = None
best_result_tree = 1
best_depth_tree = 0

# loop to iterate over max_depth
for depth in range(1, 11):
    model = DecisionTreeRegressor(random_state=12345, max_depth=depth)
    model.fit(features_train, target_train)
    predictions_valid = model.predict(features_valid)
    rmse = np.sqrt(mean_squared_error(target_valid, predictions_valid))
    if rmse < best_result_tree:
        best_model_tree = model
        best_result_tree = rmse
        best_depth_tree = depth

print(f"Best validation RMSE: {best_result_tree} with max_depth: {best_depth_tree}")

# evaluate the best model on the test set
test_predictions_tree = best_model_tree.predict(features_test)
test_rmse_tree = np.sqrt(mean_squared_error(target_test, test_predictions_tree))
#print(f"RMSE of the Decision Tree model on the test set: {test_rmse_tree}")

Best validation RMSE: 0.4042780303405697 with max_depth: 5


## Linear Regression

In [60]:
# initialize the linear regression model
model = LinearRegression()
model.fit(features_train, target_train)
predictions_valid = model.predict(features_valid)

# calculate the RMSE
rmse = np.sqrt(mean_squared_error(target_valid, predictions_valid))

print("Best validation RMSE:", rmse)

# evaluate the best model on the test set
test_predictions_linear = model.predict(features_test)
test_rmse_linear = np.sqrt(mean_squared_error(target_test, test_predictions_linear))
#print("RMSE of the Linear Regression model on the test set:", test_rmse_linear)

Best validation RMSE: 0.4454561808852696


## Random Forest Regression

In [61]:

# initialize random forest
best_model_tree = None
best_result_forest = 1
best_est_forest = 0
best_depth = 0

# loop to iterate over n_estimators and max_depth
for est in range(5, 51, 5):
    for depth in range(1, 11):
        model = RandomForestRegressor(random_state=12345, n_estimators=est, max_depth=depth)
        model.fit(features_train, target_train)
        predictions_valid = model.predict(features_valid)
        rmse = np.sqrt(mean_squared_error(target_valid, predictions_valid))
        if rmse < best_result_forest:
            best_model_forest = model
            best_result_forest = rmse
            best_est_forest = est
            best_depth = depth

print(f"Best validation RMSE: {best_result_forest} with n_estimators: {best_est_forest} and max_depth: {best_depth}")

# evaluate the best model on the test set
test_predictions_forest = best_model_forest.predict(features_test)
test_rmse_forest = np.sqrt(mean_squared_error(target_test, test_predictions_forest))
#print(f"RMSE of the Random Forest Regression model on the test set: {test_rmse}")


Best validation RMSE: 0.37574436793812704 with n_estimators: 10 and max_depth: 9


## Compare each model against the Test Data Set

In [62]:
# Evaluate the best model
print(f"RMSE of the Decision Tree model on the test set: {test_rmse_tree}")
print("RMSE of the Linear Regression model on the test set:", test_rmse_linear)
print(f"RMSE of the Random Forest Regression model on the test set: {test_rmse_forest}")

RMSE of the Decision Tree model on the test set: 0.40127475304446336
RMSE of the Linear Regression model on the test set: 0.44408053555898447
RMSE of the Random Forest Regression model on the test set: 0.3887451026869719


### In both our validation and testing the Random Forest model has the highest accuracy base on RMSE values.  We do see the Decision tree drop below 40 percent against our test data, howest it's still not as accurate or perform consistently as well at the Random Forest model. 

# Reccomendation 

### Given the limits placed on our Random Forest estimation range, it only took several seconds longer to perform the desired calculations.  This model is the preferred approach to calculate which users might be interested in upgrading their plan to Ultimate.  As we learned in our first project, upgrading customers to the Ultimate plan should also generates more revenue for the Megaline company.

#### All project files are available at Github : https://github.com/Tom-Kinstle/sprint_7