In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error,r2_score

In [None]:
# Load the cellphone dataset from the project-relative data directory
data = pd.read_csv("data/Cellphone.csv")
# Preview the first five rows as the cell output
data.head(n=5)

In [None]:
## Cleaning the data

# Count missing entries in each column (shown as the cell output)
data.isna().sum()

In [None]:
# Show rows that repeat an earlier row exactly (the first occurrence is not flagged)
data[data.duplicated(keep="first")]

In [21]:
# Remove the 'Product_id' column since it's not needed.
# Rebinding `data` (instead of dropping in place) together with errors="ignore"
# keeps this cell safe to re-run: a second execution no longer raises KeyError
# once the column is already gone.
data = data.drop(columns="Product_id", errors="ignore")

# Separating out the independent and dependent variables
X = data.drop(columns="Price")  # every remaining column except the target
y = data["Price"]               # target column


In [None]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

# Standardize the features: the scaler is fitted on the training split only,
# then the same fitted scaler is applied to the test split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit the linear regression model and predict on the held-out rows
model = LinearRegression().fit(X_train, y_train)
predictions = model.predict(X_test)

# Evaluation metrics: R^2 and root mean squared error on the test split
r2_value = r2_score(y_test, predictions)
rmse_value = np.sqrt(mean_squared_error(y_test, predictions))
print(f"r2_score is: {r2_value}")
print(f"RMSE is: {rmse_value}")