In [None]:
'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew, boxcox
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import xgboost as xgb

# Load dataset
df = pd.read_csv("smartphones_data.csv")
df1 = df.copy()
print("Shape of dataset:", df1.shape)

# Data exploration
df1.describe()
print("Missing values:\n", df1.isnull().sum())

# Drop duplicates
df1.drop_duplicates(inplace=True)
print("Shape after removing duplicates:", df1.shape)

# Fix column names
print("Original columns:", df1.columns)
df1.rename(columns={'primery_rear_camera': 'primary_rear_camera',
                     'primery_front_camera': 'primary_front_camera'}, inplace=True)
print("Fixed columns:", df1.columns)

# Calculate missing percentage
missing_percentage = (df1.isnull().sum() / len(df1)) * 100
print("Missing percentage:\n", missing_percentage)

# Fill missing values
df1['has_fingerprints'].fillna(df1['has_fingerprints'].mode()[0], inplace=True)
df1['has_nfc'].fillna(df1['has_nfc'].mode()[0], inplace=True)
df1['has_5g'].fillna(df1['has_5g'].mode()[0], inplace=True)
df1['num_core'].fillna(df1['num_core'].median(), inplace=True)
df1['refresh_rate(hz)'].fillna(df1['refresh_rate(hz)'].median(), inplace=True)

# Print unique values of categorical features
print("Unique brand names:", df1['brand_name'].unique())  
print("Unique OS:", df1['OS'].unique())  
print("Unique processor brands:", df1['processor_brand'].unique())  
print("Unique display types:", df1['display_types'].unique())

# Standardize categorical values
df1['brand_name'] = df1['brand_name'].replace({'moto': 'motorola', 'Other': 'other'})
df1['processor_brand'] = df1['processor_brand'].replace({'tru-mediatek': 'mediatek',
                                                         'huawei': 'hisilicon',
                                                         'quad': 'other',
                                                         'spreadtrum': 'unisoc',
                                                         'st-ericsson': 'other'})

# Visualize distribution of brands
plt.figure(figsize=(10, 4))
sns.countplot(data=df1, x='brand_name', color='blue')
plt.title("Number of Smartphones by Brand")
plt.xlabel("Brand")
plt.ylabel("Count")
plt.xticks(rotation=50)
plt.show()

# Check skewness of numeric columns
numeric_cols = df1.select_dtypes(include=['number'])
skewness_values = numeric_cols.apply(skew)
print("Skewness values:\n", skewness_values)

# Define numeric columns for analysis
numeric_cols = ['Price', 'RAM', 'storage', 'Battery_cap', 'num_core',
                'primary_rear_camera', 'Num_Rear_Cameras', 'primary_front_camera',
                'num_front_camera', 'display_size(inch)', 'refresh_rate(hz)']

# Create boxplots for numeric columns
plt.figure(figsize=(15, 10))
for i, col in enumerate(numeric_cols, 1):
    plt.subplot(4, 3, i)
    sns.boxplot(y=df1[col])
    plt.title(f'Boxplot of {col}')
plt.tight_layout()
plt.show()

# Create correlation heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(df1[numeric_cols].corr(), annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap of Numerical Features")
plt.show()

# Create histograms for numeric columns
plt.figure(figsize=(15, 10))
for i, col in enumerate(numeric_cols, 1):
    plt.subplot(4, 3, i)
    sns.histplot(df1[col], kde=True, bins=30)
    plt.title(f'Histogram of {col}')
plt.tight_layout()
plt.show()

# Make a copy of Price for transformations
original_price = df1['Price'].copy()

# Apply Box-Cox transformation (fixed: handle non-positive values)
# Ensure all prices are positive for Box-Cox transformation
min_price = df1['Price'].min()
if min_price <= 0:
    df1['Price_BoxCox_Input'] = df1['Price'] + abs(min_price) + 1
else:
    df1['Price_BoxCox_Input'] = df1['Price']

# Apply Box-Cox transformation
df1['Price_BoxCox'], lambda_val = boxcox(df1['Price_BoxCox_Input'])

# Apply Log transformation separately for comparison
df1['Price_Log'] = np.log1p(df1['Price'])

# Visualize transformations
plt.figure(figsize=(15, 5))
plt.subplot(1, 3, 1)
sns.histplot(df1['Price'], bins=30, kde=True, color="red", edgecolor="black")
plt.title("Original Price Distribution")
plt.xlabel("Price")

plt.subplot(1, 3, 2)
sns.histplot(df1['Price_Log'], bins=30, kde=True, color="green", edgecolor="black")
plt.title("Log-Transformed Price Distribution")
plt.xlabel("Price_Log")

plt.subplot(1, 3, 3)
sns.histplot(df1['Price_BoxCox'], bins=30, kde=True, color="blue", edgecolor="black")
plt.title("Box-Cox Transformed Price Distribution")
plt.xlabel("Price_BoxCox")
plt.show()

# Choose the best transformation (Box-Cox seems better)
df1['Price'] = df1['Price_BoxCox']
df1.drop(['Price_BoxCox', 'Price_Log', 'Price_BoxCox_Input'], axis=1, inplace=True)

# Transform skewed numeric features
skewed_cols = ['RAM', 'storage', 'Battery_cap', 'primary_rear_camera', 'primary_front_camera']
for col in skewed_cols:
    if skew(df1[col]) > 0.5:  # Apply if skewness > 0.5
        df1[col] = np.log1p(df1[col])

# Function to cap outliers using IQR
def cap_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)
    return df

# Apply to relevant columns
outlier_cols = ['Price', 'RAM', 'storage', 'Battery_cap', 'primary_rear_camera', 'primary_front_camera']
for col in outlier_cols:
    df1 = cap_outliers(df1, col)

# Create categorical features
df1['RAM_category'] = pd.cut(df1['RAM'],
                           bins=[0, np.log1p(4), np.log1p(8), np.log1p(12), np.log1p(24)], 
                           labels=['low', 'medium', 'high', 'very_high'])
df1['RAM_category'] = df1['RAM_category'].map({'low': 0, 'medium': 1, 'high': 2, 'very_high': 3}).fillna(0).astype(int)

df1['battery_category'] = pd.cut(df1['Battery_cap'], 
                               bins=[0, np.log1p(3500), np.log1p(4500), np.log1p(6000)], 
                               labels=['small', 'medium', 'big'])
df1['battery_category'] = df1['battery_category'].map({'small': 0, 'medium': 1, 'big': 2}).fillna(0).astype(int)

# Encode binary columns
binary_cols = ['has_fast_charging', 'has_fingerprints', 'has_nfc', 'has_5g']
for col in binary_cols:
    df1[col] = df1[col].map({'Yes': 1, 'No': 0})

# Check the data types before encoding
print("\nData types before encoding:")
print(df1.dtypes)

# Encode categorical columns - FIXED: Use get_dummies correctly
categorical_cols = ['brand_name', 'OS', 'processor_brand', 'display_types']

# First, check if these columns exist in the dataframe
for col in categorical_cols: 
    if col not in df1.columns:
        print(f"Warning: Column {col} not found in the dataframe")

# Properly encode all categorical columns
df_encoded = pd.get_dummies(df1, columns=categorical_cols, drop_first=True)

# Verify that all object columns are now encoded
print("\nData types after encoding:")
print(df_encoded.dtypes)
object_cols = df_encoded.select_dtypes(include=['object']).columns
if len(object_cols) > 0:
    print(f"Warning: There are still object columns: {object_cols}")
    # Convert any remaining object columns to numeric if possible
    for col in object_cols:
        try:
            df_encoded[col] = pd.to_numeric(df_encoded[col])
        except:
            print(f"Could not convert {col} to numeric, dropping it")
            df_encoded = df_encoded.drop(col, axis=1)

# Split into features and target
X = df_encoded.drop('Price', axis=1)
y = df_encoded['Price']

# Check for any remaining non-numeric columns
print("\nFinal check for non-numeric columns:")
non_numeric = X.select_dtypes(exclude=['number']).columns
if len(non_numeric) > 0:
    print(f"Non-numeric columns found: {non_numeric}")
    # Drop these columns as a last resort
    X = X.drop(non_numeric, axis=1)
else:
    print("All columns are numeric - ready for XGBoost!")

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nTraining XGBoost model...")
# Train XGBoost model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred = xgb_model.predict(X_test)

# Inverse transform predictions and actual values for evaluation
# If using Box-Cox, we need to inverse transform
from scipy.special import inv_boxcox
y_pred_original = inv_boxcox(y_pred, lambda_val)
y_test_original = inv_boxcox(y_test, lambda_val)

# Evaluate model
print("\nModel Evaluation:")
print(f"Mean Absolute Error: ${mean_absolute_error(y_test_original, y_pred_original):.2f}")
print(f"Root Mean Squared Error: ${np.sqrt(mean_squared_error(y_test_original, y_pred_original)):.2f}")
print(f"R-squared: {r2_score(y_test_original, y_pred_original):.4f}")

# Feature importance
plt.figure(figsize=(12, 8))
feature_importance = xgb_model.feature_importances_
feature_names = X.columns
importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importance})
importance_df = importance_df.sort_values('importance', ascending=False).head(20)

plt.barh(importance_df['feature'], importance_df['importance'])
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance')
plt.gca().invert_yaxis()  # Display the most important feature at the top
plt.show()

# Hyperparameter Tuning (optional)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# Uncomment to perform grid search (warning: can be time-consuming)
# grid_search = GridSearchCV(estimator=xgb.XGBRegressor(objective='reg:squarederror', random_state=42),
#                           param_grid=param_grid,
#                           cv=5, scoring='neg_mean_squared_error', verbose=1)
# grid_search.fit(X_train, y_train)
# print("Best parameters:", grid_search.best_params_)
# best_model = grid_search.best_estimator_

# Visualize predictions vs actual
plt.figure(figsize=(10, 6))
plt.scatter(y_test_original, y_pred_original, alpha=0.5)
plt.plot([min(y_test_original), max(y_test_original)], [min(y_test_original), max(y_test_original)], 'r--')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.title('Actual vs Predicted Smartphone Prices')
plt.show() '''

In [None]:


# Report how many nulls remain in each column.
print("Missing values after imputation:")
print(df1.isnull().sum())

# Check unique values in categorical columns
print("Unique brands:", df1['brand_name'].unique())
print("Unique OS types:", df1['OS'].unique())
print("Unique processor brands:", df1['processor_brand'].unique())
print("Unique display types:", df1['display_types'].unique())


# Printed a second time; no relabelling happens between the two prints here.
print("Updated processor brands:", df1['processor_brand'].unique())

# Drop the Name column (unnecessary for prediction).
# errors='ignore' keeps the cell re-runnable once the column is already gone,
# which is how the later cells of this notebook drop it as well.
df1.drop('Name', axis=1, inplace=True, errors='ignore')

# Data visualization - uncomment if needed
"""
# Brand distribution
plt.figure(figsize=(10, 4))
sns.countplot(data=df1, x='brand_name', color='blue')
plt.title("Number of Smartphones by Brand")
plt.xlabel("Brand")
plt.ylabel("Count")
plt.xticks(rotation=50)
plt.show()

# Price distribution
plt.figure(figsize=(10, 6))
plt.hist(df1['Price'], bins=20, color='skyblue', edgecolor='k', alpha=0.7)
plt.title('Distribution of Phone Prices (INR)')
plt.xlabel('Price in INR')
plt.ylabel('Frequency')
plt.show()

# Calculate skewness of numeric columns
numeric_cols = df1.select_dtypes(include=['number'])
skewness_values = numeric_cols.apply(skew)
print("Skewness of numeric features:")
print(skewness_values)

# Boxplots of numeric columns
numeric_cols_for_plots = ['Price', 'RAM', 'storage', 'Battery_cap', 'num_core',
                          'primary_rear_camera', 'Num_Rear_Cameras', 'primary_front_camera',
                          'num_front_camera', 'display_size(inch)', 'refresh_rate(hz)']
plt.figure(figsize=(15, 10))
for i, col in enumerate(numeric_cols_for_plots, 1):
    plt.subplot(4, 3, i)
    sns.boxplot(y=df1[col])
    plt.title(f'Boxplot of {col}')
plt.tight_layout()
plt.show()

# Correlation heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(df1[numeric_cols_for_plots].corr(), annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap of Numerical Features")
plt.show()

# Histograms of numeric columns
plt.figure(figsize=(15, 10))
for i, col in enumerate(numeric_cols_for_plots, 1):
    plt.subplot(4, 3, i)
    sns.histplot(df1[col], kde=True, bins=30)
    plt.title(f'Histogram of {col}')
plt.tight_layout()
plt.show()
"""

# Keep an untouched copy of the target before any transformation.
original_price = df1['Price'].copy()

# Box-Cox needs strictly positive input: when the smallest price is zero or
# negative, shift the whole column so that its minimum becomes 1.
min_price = df1['Price'].min()
df1['Price_BoxCox_Input'] = (
    (df1['Price'] + abs(min_price) + 1) if min_price <= 0 else df1['Price']
)

# Fit Box-Cox; the fitted lambda is kept because the inverse transform needs it.
df1['Price_BoxCox'], lambda_boxcox = boxcox(df1['Price_BoxCox_Input'])

# log(1 + price), computed alongside for comparison.
df1['Price_Log'] = np.log1p(df1['Price'])

# Visualize transformations - uncomment if needed
"""
plt.figure(figsize=(15, 5))
plt.subplot(1, 3, 1)
sns.histplot(df1['Price'], bins=30, kde=True, color="red", edgecolor="black")
plt.title("Original Price Distribution")
plt.xlabel("Price")

plt.subplot(1, 3, 2)
sns.histplot(df1['Price_Log'], bins=30, kde=True, color="green", edgecolor="black")
plt.title("Log-Transformed Price Distribution")
plt.xlabel("Price_Log")

plt.subplot(1, 3, 3)
sns.histplot(df1['Price_BoxCox'], bins=30, kde=True, color="blue", edgecolor="black")
plt.title("Box-Cox Transformed Price Distribution")
plt.xlabel("Price_BoxCox")
plt.show()
"""

# Box-Cox is the chosen target transform: the raw values move to
# 'Price_Original' and 'Price' carries the transformed target from here on.
df1['Price_Original'] = df1['Price']
df1['Price'] = df1['Price_BoxCox']

# log1p every candidate feature whose sample skewness exceeds 0.5.
skewed_cols = ['RAM', 'storage', 'Battery_cap', 'primary_rear_camera', 'primary_front_camera']
for col in skewed_cols:
    if not skew(df1[col]) > 0.5:
        continue  # not skewed enough (or skewness is NaN): leave as is
    df1[col] = np.log1p(df1[col])

print("Skewness after transformation:")
print(df1[skewed_cols].skew())

# Function to cap outliers using IQR
def cap_outliers(df, column, whisker=1.5):
    """Clip one column of ``df`` to its interquartile-range fences.

    Values below ``Q1 - whisker * IQR`` or above ``Q3 + whisker * IQR`` are
    replaced by the corresponding fence; values inside the fences are kept.

    Args:
        df: DataFrame holding the column. It is modified in place.
        column: Label of the numeric column to clip.
        whisker: Multiple of the IQR that sets the fence distance. The
            default of 1.5 reproduces the previously hard-coded behaviour.

    Returns:
        The same ``df`` object that was passed in, so both
        ``cap_outliers(df, c)`` and ``df = cap_outliers(df, c)`` work.
    """
    q1, q3 = df[column].quantile([0.25, 0.75])
    iqr = q3 - q1
    df[column] = df[column].clip(lower=q1 - whisker * iqr,
                                 upper=q3 + whisker * iqr)
    return df

# Apply to relevant columns
outlier_cols = ['Price', 'RAM', 'storage', 'Battery_cap', 'primary_rear_camera', 'primary_front_camera']
for col in outlier_cols:
    df1 = cap_outliers(df1, col)

# Feature engineering
# NOTE(review): every bin edge below is expressed on the log1p scale, which
# assumes RAM and Battery_cap were log-transformed by the skewness step
# (that step only fires when skew > 0.5) - confirm for this dataset.
#
# The outer bins are open-ended. With finite outer edges, a value beyond them
# matched no bin, became NaN and was then relabelled as category 0 by the
# fillna(0) below.
df1['RAM_category'] = pd.cut(df1['RAM'],
                             bins=[-np.inf, np.log1p(4), np.log1p(8), np.log1p(12), np.inf],
                             labels=['low', 'medium', 'high', 'very_high'])

df1['RAM_category'] = df1['RAM_category'].map(
    {'low': 0, 'medium': 1, 'high': 2, 'very_high': 3}
).fillna(0).astype(int)

# Interaction feature: main rear-camera value times the number of rear cameras.
df1['camera_quality'] = df1['primary_rear_camera'] * df1['Num_Rear_Cameras'].astype(float)

# Battery bins use log1p edges (raw 3500/4500/6000 edges would place every
# log-transformed capacity in 'small') plus a 'very_big' overflow bin, the
# same layout the later revision of this step in this notebook uses.
df1['battery_category'] = pd.cut(df1['Battery_cap'],
                                 bins=[-np.inf, np.log1p(3500), np.log1p(4500), np.log1p(6000), np.inf],
                                 labels=['small', 'medium', 'big', 'very_big'])

df1['battery_category'] = df1['battery_category'].map(
    {'small': 0, 'medium': 1, 'big': 2, 'very_big': 3}
).fillna(0).astype(int)

# Convert binary columns to numeric
binary_cols = ['has_fast_charging', 'has_fingerprints', 'has_nfc', 'has_5g']
for col in binary_cols:
    # Skip columns that are already numeric: mapping Yes/No over an
    # already-encoded column would turn every value into NaN.
    if not pd.api.types.is_numeric_dtype(df1[col]):
        df1[col] = df1[col].map({'Yes': 1, 'No': 0})

# Encode categorical columns
categorical_cols = ['brand_name', 'OS', 'processor_brand', 'display_types']
# dtype=int: recent pandas versions emit boolean dummies by default, and the
# "number"-only dtype filter further down would discard every one of them.
df_encoded = pd.get_dummies(df1, columns=categorical_cols, drop_first=True, dtype=int)

# Check for any remaining object columns
object_cols = df_encoded.select_dtypes(include=['object']).columns
if len(object_cols) > 0:
    print(f"Warning: There are still object columns: {object_cols}")
    # Convert any remaining object columns to numeric if possible
    for col in object_cols:
        try:
            df_encoded[col] = pd.to_numeric(df_encoded[col])
        except (ValueError, TypeError):
            print(f"Could not convert {col} to numeric, dropping it")
            df_encoded = df_encoded.drop(col, axis=1)

# Clean up temporary transformation columns. 'Price_BoxCox' must go too: it
# holds the transformed target, so leaving it in would leak the answer into X.
df_encoded.drop(['Price_BoxCox_Input', 'Price_BoxCox', 'Price_Log'],
                axis=1, errors='ignore', inplace=True)

# Split data into features and target
X = df_encoded.drop(['Price', 'Price_Original'], axis=1, errors='ignore')
y = df_encoded['Price']  # Using the transformed price

# Check for any remaining non-numeric columns. This is done on X before the
# split so that X, X_train and X_test always share the same columns.
non_numeric = X.select_dtypes(exclude=['number']).columns
if len(non_numeric) > 0:
    print(f"Non-numeric columns found: {non_numeric}")
    X = X.drop(non_numeric, axis=1)
else:
    print("All columns are numeric - ready for XGBoost!")

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nTraining XGBoost model...")
# Train XGBoost model
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

xgb_model.fit(X_train, y_train)

# Make predictions on test set (still in transformed space)
y_pred_transformed = xgb_model.predict(X_test)

# Local import so this cell does not depend on another cell having imported it.
from scipy.special import inv_boxcox

# The Box-Cox input was shifted by abs(min) + 1 when the smallest raw price
# was <= 0; the same shift has to be removed again after inverting.
raw_min_price = df_encoded['Price_Original'].min()
price_shift = abs(raw_min_price) + 1 if raw_min_price <= 0 else 0

# Convert predictions back to original scale
y_pred_original = inv_boxcox(y_pred_transformed, lambda_boxcox) - price_shift

# Get original scale of test data for evaluation (raw, un-capped prices)
y_test_original = df_encoded.loc[y_test.index, 'Price_Original']

# Model evaluation
print("\nModel Evaluation:")
print(f"Mean Absolute Error: INR {mean_absolute_error(y_test_original, y_pred_original):.2f}")
print(f"Root Mean Squared Error: INR {np.sqrt(mean_squared_error(y_test_original, y_pred_original)):.2f}")
print(f"R-squared: {r2_score(y_test_original, y_pred_original):.4f}")

# Feature importance
plt.figure(figsize=(12, 8))
feature_importance = xgb_model.feature_importances_
# Label the importances with the columns the model was fitted on (X_train);
# X may still carry columns that were removed from X_train before fitting.
importance_df = pd.DataFrame({'feature': X_train.columns, 'importance': feature_importance})
importance_df = importance_df.sort_values('importance', ascending=False).head(20)

plt.barh(importance_df['feature'], importance_df['importance'])
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance')
plt.gca().invert_yaxis()  # Display the most important feature at the top
plt.show()

# Optional: Cross-validation for more robust evaluation
from sklearn.model_selection import cross_val_score
print("\nPerforming 5-fold cross-validation...")
# Same feature set as the fitted model; y is the transformed price, so the
# RMSE values printed below are on the transformed scale.
cv_scores = cross_val_score(xgb_model, X[X_train.columns], y, cv=5, scoring='neg_mean_squared_error')
rmse_scores = np.sqrt(-cv_scores)
print(f"Cross-validation RMSE scores: {rmse_scores}")
print(f"Mean RMSE: {rmse_scores.mean():.4f}, Std Dev: {rmse_scores.std():.4f}")

# Function to make predictions for new smartphones
def predict_smartphone_price(model, features_dict, lambda_val,
                             feature_columns=None, log_cols=None, cat_cols=None):
    """Predict the price of one smartphone on the original price scale.

    Args:
        model: Fitted regressor exposing ``predict``; its output is treated as
            a Box-Cox-transformed price.
        features_dict: Mapping of raw feature name to value for one phone.
            Categorical features are given as their label (e.g. a brand name).
        lambda_val: Box-Cox lambda used to invert the model output.
        feature_columns: Ordered column labels the model expects. Defaults to
            the notebook-level ``X.columns``.
        log_cols: Features to pass through ``log1p`` when their value is
            positive. Defaults to the notebook-level ``skewed_cols``; pass the
            columns that were really log-transformed during training if that
            list is narrower.
        cat_cols: Categorical features to one-hot encode. Defaults to the
            notebook-level ``categorical_cols``.

    Returns:
        The prediction after applying the inverse Box-Cox transform.

    Notes:
        Any expected column that ``features_dict`` does not supply (after
        encoding) is filled with 0, and keys the model does not expect are
        discarded.
    """
    # Local import so the function works even if no cell imported it yet.
    from scipy.special import inv_boxcox

    # Fall back to the objects created by the training cells.
    if feature_columns is None:
        feature_columns = X.columns
    if log_cols is None:
        log_cols = skewed_cols
    if cat_cols is None:
        cat_cols = categorical_cols

    feature_columns = list(feature_columns)
    known_columns = set(feature_columns)
    row = dict(features_dict)  # work on a copy, never mutate the caller's dict

    # Apply the same log transform as the training data
    for col in log_cols:
        if col in row and row[col] > 0:
            row[col] = np.log1p(row[col])

    # One-hot encode by naming the dummy column directly. Running get_dummies
    # with drop_first=True on a single row drops the only dummy it creates, so
    # the categorical value would be lost. A label with no matching training
    # column (the dropped baseline or an unseen label) leaves all dummies at 0.
    for col in cat_cols:
        if col in row:
            dummy = f"{col}_{row.pop(col)}"
            if dummy in known_columns:
                row[dummy] = 1

    # Align with the training layout: same columns, same order, 0 for absent ones
    features_df = pd.DataFrame([row]).reindex(columns=feature_columns, fill_value=0)

    # Make prediction (transformed scale) and convert back to original scale
    pred_transformed = model.predict(features_df)[0]
    return inv_boxcox(pred_transformed, lambda_val)

# Example usage (uncomment to test)
"""
sample_phone = {
    'RAM': 8,
    'storage': 128,
    'Battery_cap': 5000,
    'has_fast_charging': 1,
    'has_fingerprints': 1,
    'has_nfc': 1,
    'has_5g': 1,
    'num_core': 8,
    'primary_rear_camera': 64,
    'Num_Rear_Cameras': 4,
    'primary_front_camera': 16,
    'num_front_camera': 1,
    'display_size(inch)': 6.5,
    'refresh_rate(hz)': 120,
    'brand_name': 'samsung',
    'OS': 'android',
    'processor_brand': 'snapdragon',
    'display_types': 'amoled display'
}

predicted_price = predict_smartphone_price(xgb_model, sample_phone, lambda_boxcox)
print(f"\nPredicted price for the sample smartphone: INR {predicted_price:.2f}")
"""

In [None]:

# 'Price_Transformed' is only created by a later cell of this notebook; when
# it is missing, derive it from the transformed 'Price' column so that the
# capping below does not fail with a KeyError.
if 'Price_Transformed' not in df1.columns:
    df1['Price_Transformed'] = df1['Price']

# Apply to relevant columns
outlier_cols = ['Price_Transformed', 'RAM', 'storage', 'Battery_cap', 'primary_rear_camera', 'primary_front_camera']
for col in outlier_cols:
    df1 = cap_outliers(df1, col)

# Feature engineering
# NOTE(review): the bin edges are on the log1p scale, i.e. they assume RAM and
# Battery_cap were log-transformed earlier - confirm for this dataset.
# Outer bins are open-ended so that out-of-range values are not turned into
# NaN and then silently relabelled as category 0 by fillna(0).
# Create RAM category
df1['RAM_category'] = pd.cut(df1['RAM'],
                             bins=[-np.inf, np.log1p(4), np.log1p(8), np.log1p(12), np.inf],
                             labels=['low', 'medium', 'high', 'very_high'])
df1['RAM_category'] = df1['RAM_category'].map({
    'low': 0, 'medium': 1, 'high': 2, 'very_high': 3
}).fillna(0).astype(int)

# Create camera quality feature
df1['camera_quality'] = df1['primary_rear_camera'] * df1['Num_Rear_Cameras'].astype(float)

# Create battery category - uses transformed (log1p) edges plus an overflow bin
df1['battery_category'] = pd.cut(df1['Battery_cap'],
                                 bins=[-np.inf, np.log1p(3500), np.log1p(4500), np.log1p(6000), np.inf],
                                 labels=['small', 'medium', 'big', 'very_big'])
df1['battery_category'] = df1['battery_category'].map({
    'small': 0, 'medium': 1, 'big': 2, 'very_big': 3
}).fillna(0).astype(int)

# Encode binary columns
binary_cols = ['has_fast_charging', 'has_fingerprints', 'has_nfc', 'has_5g']
for col in binary_cols:
    # Skip columns that are already numeric: mapping Yes/No over an
    # already-encoded column would turn every value into NaN.
    if not pd.api.types.is_numeric_dtype(df1[col]):
        df1[col] = df1[col].map({'Yes': 1, 'No': 0})

# Remove the 'Name' column early since it's not useful for modeling
df1 = df1.drop('Name', axis=1, errors='ignore')

# Check data types before encoding
print("\nData types before encoding:")
print(df1.dtypes)

# Encode categorical columns - Do this ONCE
categorical_cols = ['brand_name', 'OS', 'processor_brand', 'display_types']

# Check if these columns exist
for col in categorical_cols:
    if col not in df1.columns:
        print(f"Warning: Column {col} not found in the dataframe")

# Encode categorical columns.
# dtype=int: recent pandas versions emit boolean dummies by default, and the
# "number"-only dtype filter at the end of this step would discard them all.
df_encoded = pd.get_dummies(df1, columns=categorical_cols, drop_first=True, dtype=int)

# Keep track of original price
df_encoded['Original_Price'] = original_price

# Clean up price columns - remove intermediate ones. 'Price_Original' is the
# raw-price copy another cell of this notebook stores on df1; left in place it
# would end up in X and leak the target.
df_encoded = df_encoded.drop(
    ['Price', 'Price_BoxCox', 'Price_Log', 'Price_BoxCox_Input', 'Price_Original'],
    axis=1, errors='ignore')

# Check for any remaining object columns
print("\nData types after encoding:")
print(df_encoded.dtypes)

object_cols = df_encoded.select_dtypes(include=['object']).columns
if len(object_cols) > 0:
    print(f"Warning: There are still object columns: {object_cols}")
    for col in object_cols:
        try:
            df_encoded[col] = pd.to_numeric(df_encoded[col])
        except (ValueError, TypeError):
            print(f"Could not convert {col} to numeric, dropping it")
            df_encoded = df_encoded.drop(col, axis=1)

# Final check for object columns
object_cols = df_encoded.select_dtypes(include=['object']).columns
if len(object_cols) > 0:
    print(f"Still have object columns after conversion: {object_cols}")
    df_encoded = df_encoded.drop(object_cols, axis=1)

# Split data into features and target
X = df_encoded.drop(['Price_Transformed', 'Original_Price'], axis=1, errors='ignore')
y = df_encoded['Price_Transformed']  # Use transformed price for modeling

# Final check for non-numeric columns
print("\nFinal check for non-numeric columns:")
non_numeric = X.select_dtypes(exclude=['number']).columns
if len(non_numeric) > 0:
    print(f"Non-numeric columns found: {non_numeric}")
    X = X.drop(non_numeric, axis=1)
else:
    print("All columns are numeric - ready for XGBoost!")

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Raw prices of the test rows, selected by index instead of by a second split.
original_y_test = df_encoded['Original_Price'].loc[y_test.index]

print("\nTraining XGBoost model...")
# Train XGBoost model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred_transformed = xgb_model.predict(X_test)

# Convert predictions back to original scale
from scipy.special import inv_boxcox
# The Box-Cox input was shifted by abs(min) + 1 when the smallest raw price
# was <= 0; the same shift has to be removed again after inverting.
raw_min_price = original_price.min()
price_shift = abs(raw_min_price) + 1 if raw_min_price <= 0 else 0
# NOTE(review): lambda_val is not assigned by any live code visible in this
# file (the visible Box-Cox step stores its lambda as lambda_boxcox) -
# confirm which name is actually in scope.
y_pred_original = inv_boxcox(y_pred_transformed, lambda_val) - price_shift

# Model evaluation
print("\nModel Evaluation:")
print(f"Mean Absolute Error: ${mean_absolute_error(original_y_test, y_pred_original):.2f}")
print(f"Root Mean Squared Error: ${np.sqrt(mean_squared_error(original_y_test, y_pred_original)):.2f}")
print(f"R-squared: {r2_score(original_y_test, y_pred_original):.4f}")

# Bar chart of the 20 largest feature importances reported by the model.
feature_importance = xgb_model.feature_importances_
importance_df = (
    pd.DataFrame({'feature': X.columns, 'importance': feature_importance})
    .sort_values('importance', ascending=False)
    .head(20)
)

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(importance_df['feature'], importance_df['importance'])
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Feature Importance')
ax.invert_yaxis()  # most important feature at the top
plt.show()

# Scatter of actual vs predicted prices; the dashed red diagonal marks
# perfect predictions.
lo, hi = min(original_y_test), max(original_y_test)
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(original_y_test, y_pred_original, alpha=0.5)
ax.plot([lo, hi], [lo, hi], 'r--')
ax.set_xlabel('Actual Price')
ax.set_ylabel('Predicted Price')
ax.set_title('Actual vs Predicted Smartphone Prices')
plt.show()

In [None]:
# Apply to relevant columns (FIX: Use 'Price' instead of 'Price_Transformed')
outlier_cols = ['Price', 'RAM', 'storage', 'Battery_cap', 'primary_rear_camera', 'primary_front_camera']
for col in outlier_cols:
    df1 = cap_outliers(df1, col)

# Feature engineering
# NOTE(review): the bin edges are on the log1p scale, i.e. they assume RAM and
# Battery_cap were log-transformed earlier - confirm for this dataset.
# Create RAM category
df1['RAM_category'] = pd.cut(df1['RAM'],
                             bins=[-float('inf'), np.log1p(4), np.log1p(8), np.log1p(12), float('inf')],
                             labels=['low', 'medium', 'high', 'very_high'])
df1['RAM_category'] = df1['RAM_category'].map({
    'low': 0, 'medium': 1, 'high': 2, 'very_high': 3
}).fillna(0).astype(int)

# Create camera quality feature
df1['camera_quality'] = df1['primary_rear_camera'] * df1['Num_Rear_Cameras'].astype(float)

# Create battery category (FIX: Extend bins to include high-capacity batteries)
df1['battery_category'] = pd.cut(df1['Battery_cap'],
                                bins=[-float('inf'), np.log1p(3500), np.log1p(4500), np.log1p(6000), float('inf')],
                                labels=['small', 'medium', 'big', 'very_big'])
df1['battery_category'] = df1['battery_category'].map({
    'small': 0, 'medium': 1, 'big': 2, 'very_big': 3
}).fillna(0).astype(int)

# Encode binary columns
binary_cols = ['has_fast_charging', 'has_fingerprints', 'has_nfc', 'has_5g']
for col in binary_cols:
    # Skip columns that are already numeric: mapping Yes/No over an
    # already-encoded column turns every value into NaN, and the dropna()
    # applied after encoding would then remove every row.
    if not pd.api.types.is_numeric_dtype(df1[col]):
        df1[col] = df1[col].map({'Yes': 1, 'No': 0})

# Remove 'Name' column early
df1 = df1.drop('Name', axis=1, errors='ignore')

# Rename transformed price for clarity
df1['Price_Transformed'] = df1['Price']
df1['Original_Price'] = original_price
df1 = df1.drop('Price', axis=1)

# Check data types before encoding
print("\nData types before encoding:")
print(df1.dtypes)

# Encode categorical columns
categorical_cols = ['brand_name', 'OS', 'processor_brand', 'display_types']
for col in categorical_cols:
    if col not in df1.columns:
        print(f"Warning: Column {col} not found in the dataframe")

# dtype=int: recent pandas versions emit boolean dummies by default, and the
# "number"-only dtype filter just below would discard every one of them.
df_encoded = pd.get_dummies(df1, columns=categorical_cols, drop_first=True, dtype=int)

# Drop price helper columns that other cells of this notebook may have left
# on df1; any of them reaching X would leak the target into the features.
df_encoded = df_encoded.drop(
    ['Price_BoxCox', 'Price_Log', 'Price_BoxCox_Input', 'Price_Original'],
    axis=1, errors='ignore')

# Check for any remaining object columns
print("\nData types after encoding:")
print(df_encoded.dtypes)
non_numeric = df_encoded.select_dtypes(exclude=['number']).columns
if len(non_numeric) > 0:
    print(f"Non-numeric columns found: {non_numeric}")
    df_encoded = df_encoded.drop(non_numeric, axis=1)

# Check for missing values
print("\nRemaining missing values:", df_encoded.isna().sum().sum())
df_encoded = df_encoded.dropna()  # Drop any remaining NaN rows

# Split data into features and target
X = df_encoded.drop(['Price_Transformed', 'Original_Price'], axis=1)
y = df_encoded['Price_Transformed']

# Final check for non-numeric columns
print("\nFinal check for non-numeric columns:")
non_numeric = X.select_dtypes(exclude=['number']).columns
if len(non_numeric) > 0:
    print(f"Non-numeric columns found: {non_numeric}")
    X = X.drop(non_numeric, axis=1)
else:
    print("All columns are numeric - ready for XGBoost!")

# Train-test split comes first so the scaler below never sees the test rows;
# fitting it on the full matrix leaks test-set statistics into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
original_y_test = df_encoded['Original_Price'].loc[y_test.index]  # Use indices for alignment

# Feature scaling: statistics are learned from the training rows only
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)

# Full matrix under the same train-fitted scaling, kept because the
# cross-validation step further down reads X_scaled.
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

print("\nTraining XGBoost model...")
# Train XGBoost model with tuned hyperparameters
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred_train_transformed = xgb_model.predict(X_train)
y_pred_test_transformed = xgb_model.predict(X_test)

# Local import so this cell does not depend on another cell having imported it.
from scipy.special import inv_boxcox

# The Box-Cox input was shifted by abs(min) + 1 when the smallest raw price
# was <= 0; the same shift has to be removed again after inverting.
raw_min_price = original_price.min()
price_shift = abs(raw_min_price) + 1 if raw_min_price <= 0 else 0

# Convert predictions back to original scale
# NOTE(review): lambda_val is not assigned by any live code visible in this
# file (the visible Box-Cox step stores its lambda as lambda_boxcox) -
# confirm which name is actually in scope.
y_pred_train_original = inv_boxcox(y_pred_train_transformed, lambda_val) - price_shift
y_pred_test_original = inv_boxcox(y_pred_test_transformed, lambda_val) - price_shift
# Score the training set against the raw prices, exactly like the test set;
# inverting y_train would compare against outlier-capped prices instead and
# make the two sets of metrics incomparable.
y_train_original = df_encoded['Original_Price'].loc[y_train.index]

# Model evaluation
print("\nModel Evaluation:")
print("Training Set:")
print(f"MAE: ${mean_absolute_error(y_train_original, y_pred_train_original):.2f}")
print(f"RMSE: ${np.sqrt(mean_squared_error(y_train_original, y_pred_train_original)):.2f}")
print(f"R²: {r2_score(y_train_original, y_pred_train_original):.4f}")
print("Testing Set:")
print(f"MAE: ${mean_absolute_error(original_y_test, y_pred_test_original):.2f}")
print(f"RMSE: ${np.sqrt(mean_squared_error(original_y_test, y_pred_test_original)):.2f}")
print(f"R²: {r2_score(original_y_test, y_pred_test_original):.4f}")

# Cross-validation
# Local import: elsewhere in this notebook cross_val_score is only imported
# inside a different cell, so this cell must not rely on that one having run.
from sklearn.model_selection import cross_val_score

# y is the transformed price, hence the RMSE is on the transformed scale.
cv_scores = cross_val_score(xgb_model, X_scaled, y, cv=5, scoring='neg_mean_squared_error')
cv_rmse = np.sqrt(-cv_scores)
print(f"\nCross-Validation RMSE (transformed scale): {cv_rmse.mean():.4f} (+/- {cv_rmse.std() * 2:.4f})")

# Feature importance: bar chart of the 20 largest importances
plt.figure(figsize=(12, 8))
feature_importance = xgb_model.feature_importances_
importance_df = pd.DataFrame({'feature': X.columns, 'importance': feature_importance})
importance_df = importance_df.sort_values('importance', ascending=False).head(20)
plt.barh(importance_df['feature'], importance_df['importance'])
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance')
plt.gca().invert_yaxis()  # most important feature at the top
plt.show()

# Scatter of actual vs predicted prices; the dashed red diagonal marks
# perfect predictions.
lo, hi = min(original_y_test), max(original_y_test)
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(original_y_test, y_pred_test_original, alpha=0.5)
ax.plot([lo, hi], [lo, hi], 'r--')
ax.set_xlabel('Actual Price')
ax.set_ylabel('Predicted Price')
ax.set_title('Actual vs Predicted Smartphone Prices')
plt.show()