In [1]:
import pandas as pd

# Read the training data from the CSV file in the working directory
df = pd.read_csv('train.csv')

# List every column name so the available features are visible at a glance
column_names = list(df.columns)
print("Columns:", column_names)

# Show the first five rows as a quick sanity check of the parsed values
preview = df.head(5)
print(preview)


Columns: ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi', 'price_range']
   battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6   
1           1021     1          0.5         1   0       1          53    0.7   
2            563     1          0.5         1   2       1          41    0.9   
3            615     1          2.5         0   0       0          10    0.8   
4           1821     1          1.2         0  13       1          44    0.6   

   mobile_wt  n_cores  ...  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2  ...         20       756  2549     9     7         19   
1        136        3  ...        905      1988  2631    17     3          7   
2        145        5  ...       1263

In [2]:
# Fill missing values for numerical columns with their mean.
# numeric_only=True restricts the mean to numeric columns; without it,
# DataFrame.mean() raises a TypeError on pandas >= 2.0 as soon as the frame
# contains a text (object) column.
df.fillna(df.mean(numeric_only=True), inplace=True)

# For categorical columns (if any), fill missing values with the mode.
# The filled column is assigned back to the frame: calling
# fillna(inplace=True) on df[col] acts on a temporary object and leaves df
# unchanged when pandas copy-on-write is active.
categorical_columns = df.select_dtypes(include=['object']).columns
for col in categorical_columns:
    modes = df[col].mode()
    # A column made up entirely of missing values has no mode; leave it as is
    # instead of failing on an empty lookup.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])


In [4]:
from sklearn.preprocessing import StandardScaler

# Name of the column the model is trained to predict
target_column = 'price_range'

# Separate the label (y) from the predictors (X = every other column)
y = df[target_column]
X = df.drop(columns=[target_column])

# Standardise the predictors: learn the per-column statistics from X, then
# apply them to X. The fitted scaler is kept so the same transformation can
# be reused later.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)


In [5]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Initialize KFold with 5 splits; shuffling with a fixed seed keeps the fold
# assignment reproducible between runs
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Store the Mean Squared Error of each fold
mse_list = []

# Perform K-Fold Cross Validation
for train_index, test_index in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train a fresh estimator on this fold's training split only, so the
    # evaluation of one fold never touches the final model
    fold_model = LinearRegression()
    fold_model.fit(X_train, y_train)

    # Predict on the held-out split
    y_pred = fold_model.predict(X_test)

    # Calculate Mean Squared Error for this fold
    mse_list.append(mean_squared_error(y_test, y_pred))

# Display the average MSE across all folds
print(f"Average Mean Squared Error across all folds: {sum(mse_list) / len(mse_list)}")

# Cross-validation only estimates the generalisation error. The model that is
# kept for later use is fitted on every available row; otherwise `model` would
# be whatever was trained on the last fold's training split alone.
model = LinearRegression().fit(X_scaled, y)


Average Mean Squared Error across all folds: 0.10386824985599068


In [6]:
import joblib

# Persist the trained model together with the fitted scaler; the scaler is
# needed later to transform new data the same way as the training data.
artifacts = [
    (model, 'regression_model_kfold.pkl'),
    (scaler, 'scaler.pkl'),
]
for artifact, file_name in artifacts:
    joblib.dump(artifact, file_name)

print("Model and Scaler saved as regression_model_kfold.pkl and scaler.pkl")


Model and Scaler saved as regression_model_kfold.pkl and scaler.pkl
