In [21]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, plot_tree

In [23]:
# Location of the phone-usage CSV. The default is the original author's local
# path; set the PHONE_USAGE_CSV environment variable to point at the file on any
# other machine instead of editing this cell.
DATA_PATH = os.environ.get(
    "PHONE_USAGE_CSV",
    r"C:\Users\revan\Downloads\phone_usage_india.csv",
)
data = pd.read_csv(DATA_PATH)

In [25]:
# Show a heading followed by the first five rows so the parsed columns can be
# eyeballed right after loading.
print("Dataset Preview:", data.head(), sep="\n")

Dataset Preview:
  User ID  Age  Gender   Location Phone Brand       OS  Screen Time (hrs/day)  \
0  U00001   53    Male     Mumbai        Vivo  Android                    3.7   
1  U00002   60   Other      Delhi      Realme      iOS                    9.2   
2  U00003   37  Female  Ahmedabad       Nokia  Android                    4.5   
3  U00004   32    Male       Pune     Samsung  Android                   11.0   
4  U00005   16    Male     Mumbai      Xiaomi      iOS                    2.2   

   Data Usage (GB/month)  Calls Duration (mins/day)  Number of Apps Installed  \
0                   23.9                       37.9                       104   
1                   28.1                       13.7                       169   
2                   12.3                       66.8                        96   
3                   25.6                      156.2                       146   
4                    2.5                      236.2                        86   

   Social

In [27]:
# Last five rows of the frame (five is also the default for tail()).
data.tail(n=5)

Unnamed: 0,User ID,Age,Gender,Location,Phone Brand,OS,Screen Time (hrs/day),Data Usage (GB/month),Calls Duration (mins/day),Number of Apps Installed,Social Media Time (hrs/day),E-commerce Spend (INR/month),Streaming Time (hrs/day),Gaming Time (hrs/day),Monthly Recharge Cost (INR),Primary Use
17681,U17682,27,Other,Mumbai,Apple,iOS,7.2,36.6,90.4,81,5.1,4007,3.8,3.3,1380,Entertainment
17682,U17683,40,Female,Chennai,Oppo,iOS,9.5,12.9,243.0,166,4.3,8550,3.4,0.7,222,Education
17683,U17684,34,Female,Ahmedabad,Realme,Android,1.1,48.9,74.7,70,5.3,5516,0.7,4.0,1612,Entertainment
17684,U17685,22,Male,Hyderabad,Vivo,Android,8.8,25.6,105.6,96,2.4,3614,6.6,0.8,1528,Work
17685,U17686,43,Other,Kolkata,Oppo,iOS,5.6,22.5,215.2,78,5.1,5332,0.8,3.6,1098,Gaming


In [29]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; the quartiles listed here are the ones describe() uses by default.
data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Age,Screen Time (hrs/day),Data Usage (GB/month),Calls Duration (mins/day),Number of Apps Installed,Social Media Time (hrs/day),E-commerce Spend (INR/month),Streaming Time (hrs/day),Gaming Time (hrs/day),Monthly Recharge Cost (INR)
count,17686.0,17686.0,17686.0,17686.0,17686.0,17686.0,17686.0,17686.0,17686.0,17686.0
mean,37.584247,6.546376,25.411257,151.405846,104.584869,3.252369,5075.707848,4.250616,2.490874,1042.785367
std,13.338252,3.172677,14.122167,84.923353,55.217097,1.590223,2871.604841,2.155683,1.446003,552.502067
min,15.0,1.0,1.0,5.0,10.0,0.5,100.0,0.5,0.0,100.0
25%,26.0,3.8,13.2,77.325,57.0,1.9,2587.5,2.4,1.2,561.0
50%,38.0,6.6,25.3,150.6,104.0,3.2,5052.0,4.2,2.5,1040.0
75%,49.0,9.3,37.6,223.9,152.0,4.6,7606.0,6.1,3.7,1521.75
max,60.0,12.0,50.0,300.0,200.0,6.0,10000.0,8.0,5.0,2000.0


In [31]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(17686, 16)

In [33]:
# Column labels exactly as read from the CSV header; later cells refer to
# columns by these strings.
data.columns

Index(['User ID', 'Age', 'Gender', 'Location', 'Phone Brand', 'OS',
       'Screen Time (hrs/day)', 'Data Usage (GB/month)',
       'Calls Duration (mins/day)', 'Number of Apps Installed',
       'Social Media Time (hrs/day)', 'E-commerce Spend (INR/month)',
       'Streaming Time (hrs/day)', 'Gaming Time (hrs/day)',
       'Monthly Recharge Cost (INR)', 'Primary Use'],
      dtype='object')

In [35]:
# Frequency of each distinct monthly recharge cost, most common value first.
data["Monthly Recharge Cost (INR)"].value_counts()

Monthly Recharge Cost (INR)
921     24
1939    20
258     20
953     20
316     19
        ..
1717     2
606      2
1447     2
1421     2
1187     1
Name: count, Length: 1901, dtype: int64

In [27]:
# Count the null entries in every column (isna is the canonical spelling of
# isnull) and print the tally under a heading.
missing_per_column = data.isna().sum()
print("\nMissing Values:", missing_per_column, sep="\n")


Missing Values:
User ID                         0
Age                             0
Gender                          0
Location                        0
Phone Brand                     0
OS                              0
Screen Time (hrs/day)           0
Data Usage (GB/month)           0
Calls Duration (mins/day)       0
Number of Apps Installed        0
Social Media Time (hrs/day)     0
E-commerce Spend (INR/month)    0
Streaming Time (hrs/day)        0
Gaming Time (hrs/day)           0
Monthly Recharge Cost (INR)     0
Primary Use                     0
dtype: int64


In [29]:
# Data summary
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line after
# the report.
print("\nDataset Information:")
data.info()


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17686 entries, 0 to 17685
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   User ID                       17686 non-null  object 
 1   Age                           17686 non-null  int64  
 2   Gender                        17686 non-null  object 
 3   Location                      17686 non-null  object 
 4   Phone Brand                   17686 non-null  object 
 5   OS                            17686 non-null  object 
 6   Screen Time (hrs/day)         17686 non-null  float64
 7   Data Usage (GB/month)         17686 non-null  float64
 8   Calls Duration (mins/day)     17686 non-null  float64
 9   Number of Apps Installed      17686 non-null  int64  
 10  Social Media Time (hrs/day)   17686 non-null  float64
 11  E-commerce Spend (INR/month)  17686 non-null  int64  
 12  Streaming Time (hrs/day)      17686 no

In [45]:
# Preprocessing
# Remove every row that has at least one missing field. The arguments spell out
# dropna()'s defaults; switch to imputation here if dropping rows is too lossy.
data = data.dropna(axis=0, how="any")

In [53]:
# Define feature variables (X) and target variable (y)
target_column = "Social Media Time (hrs/day)"  # column the model predicts

# "User ID" is a per-user label (U00001, U00002, ...), not a usage feature.
# Leaving it in X would make the later one-hot encoding create roughly one
# dummy column per row, which bloats the matrix and lets the tree memorise
# individual users instead of learning from real features.
id_column = "User ID"

X = data.drop(columns=[target_column, id_column])  # features: everything except target and ID
y = data[target_column]  # target variable

In [55]:
# One-hot encode the non-numeric feature columns. drop_first=True omits the
# first level of each encoded column, so k categories become k-1 indicators.
X = pd.get_dummies(data=X, drop_first=True)

In [57]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
splits = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = splits

In [59]:
# Initialize the Decision Tree Regressor
# All hyper-parameters are left at their defaults (no depth limit is set here);
# random_state is fixed so repeated runs build the same tree.
dt_regressor = DecisionTreeRegressor(random_state=42)

In [61]:
# Fit the tree on the training split only; the test split stays unseen until
# evaluation.
dt_regressor.fit(X=X_train, y=y_train)

In [63]:
# Generate predictions for the held-out rows.
y_pred = dt_regressor.predict(X=X_test)

In [None]:
# Score the held-out predictions with three regression metrics, computed in
# the order MSE, MAE, R^2.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

In [None]:
# Emit the heading and the three metric lines in one call, one item per line.
print(
    "\nModel Evaluation:",
    f"Mean Squared Error (MSE): {mse}",
    f"Mean Absolute Error (MAE): {mae}",
    f"R-squared (R2 Score): {r2}",
    sep="\n",
)

In [None]:
# Number of rows per phone brand, most frequent brand first; the bare name on
# the last line makes the notebook display the resulting Series.
brand_column = data.loc[:, "Phone Brand"]
pre_value = brand_column.value_counts()
pre_value

In [None]:
# Bar chart of how many rows (users) there are per phone brand.
# Drawn with matplotlib because the notebook's import cell does not import
# seaborn (`sns` would raise NameError), and "yellowblue" is not a valid
# Matplotlib colour name -- "yellowgreen" is the closest named colour.
# Bars are ordered from the most to the least common brand.
brand_counts = data["Phone Brand"].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(brand_counts.index, brand_counts.to_numpy(), color="yellowgreen")
ax.set(title="Users per Phone Brand", xlabel="Phone Brand", ylabel="count")
ax.tick_params(axis="x", rotation=45)  # keep brand labels from overlapping
plt.show()

In [None]:
# Visualize the decision tree
# The regressor was built without a depth limit, so the fitted tree can have a
# very large number of nodes; drawing all of them is slow and unreadable.
# max_depth here only limits how many levels are *drawn* -- the fitted model is
# not changed. Deeper levels are shown as truncated placeholders.
plt.figure(figsize=(15, 10))
plot_tree(
    dt_regressor,
    max_depth=3,
    # Pass a plain list: some scikit-learn releases reject a pandas Index here.
    feature_names=list(X.columns),
    filled=True,
    fontsize=10,
)
plt.title("Decision Tree Regression Visualization")
plt.show()