In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Title banner: a 60-character rule above and below the notebook heading,
# emitted with one print call (one item per output line).
print("="*60, "MOBILE PHONE PRICE PREDICTION MODEL", "="*60, sep="\n")


In [None]:
# Load the dataset from uploaded CSV file
from google.colab import files
import io

# Upload your CSV file
print("Please upload your dataset.csv file...")
uploaded = files.upload()  # mapping of uploaded file name -> file contents (bytes)

# Fail with a clear message when the upload dialog was cancelled or returned
# nothing, instead of an opaque IndexError on the filename lookup below.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select your dataset CSV file.")

# Use the first uploaded file and parse it straight from the in-memory bytes,
# so loading does not depend on the kernel's current working directory.
filename = next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[filename]))

print(f"✓ Dataset '{filename}' loaded successfully!")


In [None]:
# 1. DATA EXPLORATION
# Gather every overview section first, then emit them with a single print
# call; sep="\n" yields the same line breaks as one print() per section.
overview_sections = [
    "\n1. DATASET OVERVIEW",
    "-"*60,
    f"Dataset Shape: {df.shape}",
    f"\nColumn Names:\n{df.columns.tolist()}",
    f"\nData Types:\n{df.dtypes}",
    f"\nMissing Values:\n{df.isnull().sum()}",
    f"\nBasic Statistics:\n{df.describe()}",
    # Number of rows per price_range label, ordered by label
    "\nPrice Range Distribution:",
    df['price_range'].value_counts().sort_index(),
]
print(*overview_sections, sep="\n")


In [None]:
# 2. FEATURE ANALYSIS
print("\n\n2. FEATURE CORRELATION ANALYSIS")
print("-"*60)

# Calculate correlation of every column with the target.
# Only numeric/boolean columns are considered: on pandas >= 2.0 a plain
# DataFrame.corr() raises as soon as the CSV contains a non-numeric column
# (for example a text or ID field). For all-numeric data the result is the same.
# Values are sorted by signed correlation, so the strongest positive
# relationships come first and negative ones end up at the bottom.
correlation_with_target = (
    df.select_dtypes(include=['number', 'bool'])
    .corr()['price_range']
    .sort_values(ascending=False)
)
print("\nTop 10 Features Correlated with Price:")
print(correlation_with_target.head(11))  # 11 to include target itself


In [None]:
# 3. DATA VISUALIZATION
# 2x2 dashboard: class counts, top correlations, and two feature-vs-target scatters.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Top-left: number of rows per price_range label (sorted by label).
class_counts = df['price_range'].value_counts().sort_index()
axes[0, 0].bar(['Low', 'Medium', 'High', 'Very High'],
               class_counts.values,
               color=['green', 'blue', 'orange', 'red'])
axes[0, 0].set_title('Price Range Distribution', fontsize=12, fontweight='bold')
axes[0, 0].set_ylabel('Count')

# Top-right: the ten entries after position 0 of the descending correlation
# series (position 0 is expected to be the target's correlation with itself).
top_features = correlation_with_target.iloc[1:11]
bar_positions = range(len(top_features))
axes[0, 1].barh(bar_positions, top_features.values)
axes[0, 1].set_yticks(bar_positions)
axes[0, 1].set_yticklabels(top_features.index)
axes[0, 1].set_title('Top 10 Features by Correlation', fontsize=12, fontweight='bold')
axes[0, 1].set_xlabel('Correlation Coefficient')

# Bottom row: one scatter of a single feature against the price range per
# panel, with points coloured by price range.
scatter_panels = (
    (axes[1, 0], 'ram', 'viridis', 'RAM vs Price Range', 'RAM (MB)'),
    (axes[1, 1], 'battery_power', 'plasma', 'Battery Power vs Price Range', 'Battery Power (mAh)'),
)
for panel, column, colormap, panel_title, x_label in scatter_panels:
    panel.scatter(df[column], df['price_range'], alpha=0.5, c=df['price_range'], cmap=colormap)
    panel.set_title(panel_title, fontsize=12, fontweight='bold')
    panel.set_xlabel(x_label)
    panel.set_ylabel('Price Range')

plt.tight_layout()


In [None]:
# 4. DATA PREPARATION
print("\n\n3. DATA PREPARATION", "-"*60, sep="\n")

# The target is the price_range column; every other column is a feature.
y = df['price_range']
X = df.drop(columns='price_range')

# Hold out 20% of the rows for testing, stratified on the target, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

# Fit the scaler on the training split only, then apply it to both splits
# so the test data never influences the scaling statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# 5. MODEL TRAINING
print("\n\n4. MODEL TRAINING", "-"*60, sep="\n")

# Random Forest hyperparameters, kept together so they are easy to tune.
rf_params = dict(
    n_estimators=100,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,  # fixed seed for repeatable runs
    n_jobs=-1,
)
model = RandomForestClassifier(**rf_params)

# Fit on the scaled training split.
print("Training Random Forest Classifier...")
model.fit(X_train_scaled, y_train)
print("✓ Training complete!")


In [None]:
# 6. MODEL EVALUATION
print("\n\n5. MODEL EVALUATION", "-"*60, sep="\n")

# Predict the held-out split and report overall accuracy.
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy*100:.2f}%")

# 5-fold cross-validation on the scaled training split (mean and spread of the fold scores).
cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5)
print(f"Cross-Validation Accuracy: {cv_scores.mean()*100:.2f}% (+/- {cv_scores.std()*100:.2f}%)")

# Per-class report; display names are listed in the order of labels 0..3.
target_names = ['Low Cost', 'Medium Cost', 'High Cost', 'Very High Cost']
print("\nClassification Report:", "-"*60, sep="\n")
print(classification_report(y_test, y_pred, target_names=target_names))

# Confusion matrix as raw counts (rows: actual, columns: predicted).
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Same matrix as an annotated heatmap; labels are set on the axes that
# seaborn returns.
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=target_names,
    yticklabels=target_names,
)
heatmap_ax.set_title('Confusion Matrix', fontsize=14, fontweight='bold')
heatmap_ax.set_ylabel('Actual')
heatmap_ax.set_xlabel('Predicted')
plt.tight_layout()

In [None]:
# 7. FEATURE IMPORTANCE
print("\n\n6. FEATURE IMPORTANCE")
print("-"*60)

# Pair each feature column with the fitted forest's importance score,
# most important first.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

# Select the top rows once. head(10) returns fewer than 10 rows when the
# dataset has fewer than 10 features, so everything below is sized from the
# selection itself rather than from a hard-coded 10 (which would make
# barh/yticks fail with a length mismatch).
top_importance = feature_importance.head(10)

print("\nTop 10 Most Important Features:")
print(top_importance.to_string(index=False))

# Visualize feature importance
plt.figure(figsize=(10, 6))
bar_positions = range(len(top_importance))
plt.barh(bar_positions, top_importance['importance'].values)
plt.yticks(bar_positions, top_importance['feature'].values)
plt.xlabel('Importance Score')
plt.title('Top 10 Feature Importance', fontsize=14, fontweight='bold')
plt.gca().invert_yaxis()  # put the most important feature at the top
plt.tight_layout()

In [None]:
# 8. PREDICTION FUNCTION
# Section header followed by a 60-character rule, emitted in one call.
print("\n\n7. SAMPLE PREDICTIONS", "-"*60, sep="\n")

def predict_price(battery_power, blue, clock_speed, dual_sim, fc, four_g,
                  int_memory, m_dep, mobile_wt, n_cores, pc, px_height,
                  px_width, ram, sc_h, sc_w, talk_time, three_g,
                  touch_screen, wifi):
    """
    Predict the price range of a single mobile phone.

    Each parameter is one specification value, named after the dataset
    feature column it corresponds to. The values are scaled with the
    module-level ``scaler`` and classified with the module-level ``model``,
    so both must already be fitted.

    Returns:
        str: 'Low Cost', 'Medium Cost', 'High Cost' or 'Very High Cost'
        (class labels 0, 1, 2 and 3 respectively).

    Raises:
        ValueError: if the scaler was fitted on a column that is not one of
            this function's parameters.
        KeyError: if the model predicts a label outside 0..3.
    """
    # Keep every value attached to its column name so the input can be
    # aligned with the training data by name rather than by position.
    specs = {
        'battery_power': battery_power, 'blue': blue, 'clock_speed': clock_speed,
        'dual_sim': dual_sim, 'fc': fc, 'four_g': four_g,
        'int_memory': int_memory, 'm_dep': m_dep, 'mobile_wt': mobile_wt,
        'n_cores': n_cores, 'pc': pc, 'px_height': px_height,
        'px_width': px_width, 'ram': ram, 'sc_h': sc_h, 'sc_w': sc_w,
        'talk_time': talk_time, 'three_g': three_g,
        'touch_screen': touch_screen, 'wifi': wifi,
    }
    features = pd.DataFrame([specs])

    # When the scaler recorded the column names it was fitted on, reorder the
    # input to that exact order. A CSV whose columns are arranged differently
    # from this signature would otherwise be scaled and classified with the
    # values silently shifted into the wrong features.
    trained_columns = getattr(scaler, 'feature_names_in_', None)
    if trained_columns is None:
        # No recorded names: fall back to a plain array in signature order.
        features = features.to_numpy()
    else:
        unknown = [column for column in trained_columns if column not in specs]
        if unknown:
            raise ValueError(
                f"Scaler was fitted on columns predict_price cannot supply: {unknown}"
            )
        features = features[list(trained_columns)]

    features_scaled = scaler.transform(features)
    prediction = model.predict(features_scaled)[0]

    price_labels = {0: 'Low Cost', 1: 'Medium Cost', 2: 'High Cost', 3: 'Very High Cost'}
    return price_labels[prediction]

# Test with sample data: each entry has a display name and the 20 spec values
# in predict_price's positional parameter order.
sample_phones = [
    dict(
        name='Budget Phone',
        specs=(842, 0, 2.2, 0, 1, 0, 7, 0.6, 188, 2, 2, 20, 756, 2549, 9, 7, 19, 0, 0, 1),
    ),
    dict(
        name='Premium Phone',
        specs=(1021, 1, 2.5, 1, 5, 1, 64, 0.7, 136, 8, 20, 1920, 1080, 3500, 15, 8, 10, 1, 1, 1),
    ),
]

# Classify each sample and report the predicted label next to its name.
for phone in sample_phones:
    prediction = predict_price(*phone['specs'])
    print(f"\n{phone['name']}: Predicted as '{prediction}'")

# Closing banner with the held-out accuracy computed during evaluation.
print(
    "\n" + "="*60,
    "MODEL TRAINING COMPLETE!",
    "="*60,
    f"\nFinal Model Accuracy: {accuracy*100:.2f}%",
    "\nModel is ready for predictions!",
    sep="\n",
)