In [None]:
# Data Manipulation & Numerical Operations
import pandas as pd  
import numpy as np  

# For ignoring warinings
import warnings
warnings.filterwarnings('ignore')

# Machine Learning Libraries
from sklearn.model_selection import train_test_split  
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [269]:
Dataset = pd.read_csv("../dataset/Mobiles Dataset (2025).csv", encoding='latin1')

In [270]:
Dataset

Unnamed: 0,Company Name,Model Name,Mobile Weight,RAM,Front Camera,Back Camera,Processor,Battery Capacity,Screen Size,Launched Price (Pakistan),Launched Price (India),Launched Price (China),Launched Price (USA),Launched Price (Dubai),Launched Year
0,Apple,iPhone 16 128GB,174g,6GB,12MP,48MP,A17 Bionic,"3,600mAh",6.1 inches,"PKR 224,999","INR 79,999","CNY 5,799",USD 799,"AED 2,799",2024
1,Apple,iPhone 16 256GB,174g,6GB,12MP,48MP,A17 Bionic,"3,600mAh",6.1 inches,"PKR 234,999","INR 84,999","CNY 6,099",USD 849,"AED 2,999",2024
2,Apple,iPhone 16 512GB,174g,6GB,12MP,48MP,A17 Bionic,"3,600mAh",6.1 inches,"PKR 244,999","INR 89,999","CNY 6,499",USD 899,"AED 3,199",2024
3,Apple,iPhone 16 Plus 128GB,203g,6GB,12MP,48MP,A17 Bionic,"4,200mAh",6.7 inches,"PKR 249,999","INR 89,999","CNY 6,199",USD 899,"AED 3,199",2024
4,Apple,iPhone 16 Plus 256GB,203g,6GB,12MP,48MP,A17 Bionic,"4,200mAh",6.7 inches,"PKR 259,999","INR 94,999","CNY 6,499",USD 949,"AED 3,399",2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
925,Poco,Pad 5G 128GB,571g,8GB,8MP,8MP,Snapdragon 7s Gen 2,"10,000mAh",12.1 inches,"PKR 66,220","INR 23,999","CNY 2,099",USD 280,"AED 1,029",2024
926,Poco,Pad 5G 256GB,571g,8GB,8MP,8MP,Snapdragon 7s Gen 2,"10,000mAh",12.1 inches,"PKR 71,220","INR 25,999","CNY 2,299",USD 300,"AED 1,099",2024
927,Samsung,Galaxy Z Fold6 256GB,239g,12GB,"10MP, 4MP (UDC)",50MP,Snapdragon 8 Gen 3,4400mAh,7.6 inches,"PKR 604,999","INR 164,999","¥13,999","USD 1,899","AED 7,199",2024
928,Samsung,Galaxy Z Fold6 512GB,239g,12GB,"10MP, 4MP (UDC)",50MP,Snapdragon 8 Gen 3,4400mAh,7.6 inches,"PKR 544,999","INR 176,999","CNY 15,999",USD 1719,"AED 7,699",2024


# Data Preprocessing

In [271]:
# renaming columns names
Dataset.rename(columns={
    "Mobile Weight": "Weight(g)", 
    "RAM": "RAM(GB)", 
    "Front Camera": "Front(MP)", 
    "Back Camera": "Back(MP)", 
    "Battery Capacity": "Battery(mAh)", 
    "Screen Size": "Size(inch)",    
}, inplace=True)

In [272]:
# removing units from dataset values
import re
def ConvertValue(value, unit=None):
    number = re.sub(r'[^\d.]', '', value)
    
    if not number: return np.nan
    
    parts = number.split('.')
    
    if len(parts) > 1: number = f"{parts[0]}.{''.join(parts[1:])}" 
    else: number = parts[0]
    
    if '.' in number: number = float(number)
    else: number = int(number)
        
    if unit == 'PKR': number *= 0.003
    elif unit == 'INR': number *= 0.01
    elif unit == 'CNY': number *= 0.14
    elif unit == 'AED': number *= 0.27
    return number

Dataset['RAM(GB)'] = Dataset['RAM(GB)'].apply(lambda x: ConvertValue(x))
Dataset['Battery(mAh)'] = Dataset['Battery(mAh)'].apply(lambda x: ConvertValue(x))
Dataset['Size(inch)'] = Dataset['Size(inch)'].apply(lambda x: ConvertValue(x))
Dataset['Back(MP)'] = Dataset['Back(MP)'].apply(lambda x: ConvertValue(x))
Dataset['Front(MP)'] = Dataset['Front(MP)'].apply(lambda x: ConvertValue(x))
Dataset['Weight(g)'] = Dataset['Weight(g)'].apply(lambda x: ConvertValue(x))
# Converting all the launched prices into global currency(Dollar)
Dataset['Launched Price (USA)'] = Dataset['Launched Price (USA)'].apply(lambda x: ConvertValue(x))
Dataset['Launched Price (Pakistan)'] = Dataset['Launched Price (Pakistan)'].apply(lambda x: ConvertValue(x, 'PKR'))
Dataset['Launched Price (India)'] = Dataset['Launched Price (India)'].apply(lambda x: ConvertValue(x, 'INR'))
Dataset['Launched Price (China)'] = Dataset['Launched Price (China)'].apply(lambda x: ConvertValue(x, 'CNY'))
Dataset['Launched Price (Dubai)'] = Dataset['Launched Price (Dubai)'].apply(lambda x: ConvertValue(x, 'AED'))
# Adding average price columns in dataset
Dataset["Avg_Price_USD"] = Dataset[
    ["Launched Price (Pakistan)", "Launched Price (India)", "Launched Price (China)", "Launched Price (Dubai)", "Launched Price (USA)"]
].mean(axis=1)


In [273]:
# removing na values
Dataset = Dataset.dropna()

In [274]:
# removing duplicate rows from dataset
Dataset = Dataset.drop_duplicates()

In [275]:
# general filter, removing outliers and uninteresting values for analysis
Dataset = Dataset[~Dataset['RAM(GB)'].isin([812.0, 1.5, 10.0, 1.0, 2.0])]

# Exporting user input fields

In [276]:
import json

user_input = {}

for i in range(0, len(Dataset.columns) - 1):
    # Convert numpy types to native Python types for JSON serialization
    unique_vals = Dataset[Dataset.columns[i]].unique().tolist()
    user_input[Dataset.columns[i]] = [str(val) if isinstance(val, (np.generic, bytes)) else val for val in unique_vals]

with open('user_input.json', 'w') as f:
    json.dump(user_input, f, indent=2)

# Model for predicting price of Smart phones in india

In [277]:
Dataset

Unnamed: 0,Company Name,Model Name,Weight(g),RAM(GB),Front(MP),Back(MP),Processor,Battery(mAh),Size(inch),Launched Price (Pakistan),Launched Price (India),Launched Price (China),Launched Price (USA),Launched Price (Dubai),Launched Year,Avg_Price_USD
0,Apple,iPhone 16 128GB,174.0,6.0,12.0,48.0,A17 Bionic,3600,6.10,674.997,799.99,811.86,799.0,755.73,2024,768.3154
1,Apple,iPhone 16 256GB,174.0,6.0,12.0,48.0,A17 Bionic,3600,6.10,704.997,849.99,853.86,849.0,809.73,2024,813.5154
2,Apple,iPhone 16 512GB,174.0,6.0,12.0,48.0,A17 Bionic,3600,6.10,734.997,899.99,909.86,899.0,863.73,2024,861.5154
3,Apple,iPhone 16 Plus 128GB,203.0,6.0,12.0,48.0,A17 Bionic,4200,6.70,749.997,899.99,867.86,899.0,863.73,2024,856.1154
4,Apple,iPhone 16 Plus 256GB,203.0,6.0,12.0,48.0,A17 Bionic,4200,6.70,779.997,949.99,909.86,949.0,917.73,2024,901.3154
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
924,POCO,M7 5G 128GB,198.0,6.0,8.0,50.0,MediaTek Dimensity 7025,5110,6.67,119.997,159.99,223.86,229.0,242.73,2024,195.1154
925,Poco,Pad 5G 128GB,571.0,8.0,8.0,8.0,Snapdragon 7s Gen 2,10000,12.10,198.660,239.99,293.86,280.0,277.83,2024,258.0680
926,Poco,Pad 5G 256GB,571.0,8.0,8.0,8.0,Snapdragon 7s Gen 2,10000,12.10,213.660,259.99,321.86,300.0,296.73,2024,278.4480
927,Samsung,Galaxy Z Fold6 256GB,239.0,12.0,104.0,50.0,Snapdragon 8 Gen 3,4400,7.60,1814.997,1649.99,1959.86,1899.0,1943.73,2024,1853.5154


In [278]:
# Removing some columns
Dataset = Dataset.drop(columns=['Company Name', 'Model Name', 'Processor', 'Launched Year', 'Launched Price (Pakistan)', 'Launched Price (China)', 'Launched Price (USA)', 'Launched Price (Dubai)', 'Launched Price (India)'])

In [279]:
Dataset

Unnamed: 0,Weight(g),RAM(GB),Front(MP),Back(MP),Battery(mAh),Size(inch),Avg_Price_USD
0,174.0,6.0,12.0,48.0,3600,6.10,768.3154
1,174.0,6.0,12.0,48.0,3600,6.10,813.5154
2,174.0,6.0,12.0,48.0,3600,6.10,861.5154
3,203.0,6.0,12.0,48.0,4200,6.70,856.1154
4,203.0,6.0,12.0,48.0,4200,6.70,901.3154
...,...,...,...,...,...,...,...
924,198.0,6.0,8.0,50.0,5110,6.67,195.1154
925,571.0,8.0,8.0,8.0,10000,12.10,258.0680
926,571.0,8.0,8.0,8.0,10000,12.10,278.4480
927,239.0,12.0,104.0,50.0,4400,7.60,1853.5154


In [280]:
# removing rows where ram is 3
Dataset = Dataset[Dataset['RAM(GB)'] != 3]

# removing rows where battery is out of realistic market range
Dataset = Dataset[(Dataset['Battery(mAh)'] >= 2500) & (Dataset['Battery(mAh)'] <= 7000)]

# For rear camera (assuming column is Back(MP))
Dataset = Dataset[(Dataset['Back(MP)'] >= 8) & (Dataset['Back(MP)'] <= 108)]

# For front camera (assuming column is 'Front(MP)')
Dataset = Dataset[(Dataset['Front(MP)'] >= 5) & (Dataset['Front(MP)'] <= 32)]

# Weight filter (assuming column is 'weight')
Dataset = Dataset[(Dataset['Weight(g)'] >= 120) & (Dataset['Weight(g)'] <= 250)]

# Size filter (assuming column is 'screen_size')
Dataset = Dataset[(Dataset['Size(inch)'] >= 4.7) & (Dataset['Size(inch)'] <= 7.0)]

In [281]:
Dataset

Unnamed: 0,Weight(g),RAM(GB),Front(MP),Back(MP),Battery(mAh),Size(inch),Avg_Price_USD
0,174.0,6.0,12.0,48.0,3600,6.10,768.3154
1,174.0,6.0,12.0,48.0,3600,6.10,813.5154
2,174.0,6.0,12.0,48.0,3600,6.10,861.5154
3,203.0,6.0,12.0,48.0,4200,6.70,856.1154
4,203.0,6.0,12.0,48.0,4200,6.70,901.3154
...,...,...,...,...,...,...,...
920,210.0,8.0,20.0,108.0,5160,6.67,471.7154
921,190.0,4.0,5.0,50.0,5000,6.50,127.1154
922,195.0,6.0,16.0,64.0,5000,6.67,277.1154
923,207.0,8.0,20.0,108.0,6000,6.67,340.3154


In [None]:
# Exporting user input field 
import json

user_input = {}

for i in range(0, len(Dataset.columns) - 1):
    # Convert numpy types to native Python types for JSON serialization
    unique_vals = Dataset[Dataset.columns[i]].unique().tolist()
    user_input[Dataset.columns[i]] = [str(val) if isinstance(val, (np.generic, bytes)) else val for val in unique_vals]

with open('user_input.json', 'w') as f:
    json.dump(user_input, f, indent=2)

In [283]:
# Selecting features (independent variables) & target variable
features = ['RAM(GB)', 'Battery(mAh)', 'Size(inch)', 'Front(MP)', 'Back(MP)', 'Weight(g)']
X = Dataset[features]
y = Dataset['Avg_Price_USD'] 

In [284]:
# Split dataset into training & testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [285]:
from sklearn.model_selection import cross_val_score

# Initialize model
rf = RandomForestRegressor(n_estimators=200, random_state=42)

# Train model
rf.fit(X_train, y_train)

# Make predictions
y_pred = rf.predict(X_test)


# Use cross-validation with R² score as the metric
scores = cross_val_score(rf, X_train, y_train, cv=10, scoring='r2')
print(f"R² Cross-Validation: Mean = {np.mean(scores):.4f}, Scores = {scores}")

# Calculate metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print results
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R² Score: {r2}")


R² Cross-Validation: Mean = 0.8719, Scores = [0.87999016 0.76616442 0.84172155 0.93014616 0.88517463 0.81467708
 0.94961932 0.81131708 0.8920818  0.94765198]
MAE: 83.6081642585034
MSE: 14880.415018618849
RMSE: 121.98530656853247
R² Score: 0.7618485861101549


# Exporting model in pickle file

In [286]:
import pickle
with open('model.pickle','wb') as f:
    pickle.dump(rf,f)