In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor


In [None]:
# Load the CSV, discard every row that contains a missing value, and remove
# the listed columns, all in one chained expression.
unused_columns = [
    'Network Type', 'Full HD Recording', 'Audio Jack', 'Primary Camera',
    'Resolution', 'Operating System', 'Domestic Warranty', 'Secondary Camera',
]
dataset = (
    pd.read_csv("/content/FinalData.csv.csv")
    .dropna(how='any')
    .drop(columns=unused_columns)
)

# Positional split of the remaining columns: column 2 is the target (y) and
# every column from position 6 onward is an input feature (x).
y = dataset.iloc[:, 2].values
x = dataset.iloc[:, 6:].values

# Show the first five rows of each array as a quick sanity check.
for heading, sample in (
    ("Input features (x):", x[:5]),
    ("Target variable (y):", y[:5]),
):
    print(heading)
    print(sample)

In [None]:
# Convert the price values of the target into integers by keeping only their
# digit characters (anything else, e.g. currency symbols or separators, is
# dropped), then reshape into a single-column 2-D array.
# NOTE(review): a decimal point is dropped as well, so "12,999.50" would become
# 1299950 -- confirm the price column never contains decimals.
y = np.array([int(''.join(filter(str.isdigit, str(val)))) for val in y])
y = y.reshape(-1, 1)

# Convert the primary clock speed (column 3 of x) to a number in MHz:
# "GHz" is rewritten as a multiplication by 1000 and the "MHz" suffix removed.
# SECURITY: eval() executes whatever expression the CSV cell contains. It is
# kept so that existing values are parsed exactly as before, but it must only
# be run on trusted data. TODO: replace with an explicit parser.
pcs = x[:, 3]
pcs = np.array([eval(str(val).replace('GHz', '*1000').replace('MHz', '')) for val in pcs])

# Convert the display size (column 1 of x): take the first space-separated
# token and truncate it to an integer (6.5 -> 6).
display = x[:, 1]
display = np.array([int(float(str(val).split(" ")[0])) for val in display])

# Store the converted values back into the feature matrix. Previously they were
# only printed, so x kept the raw values in these two columns.
# x may share memory with `dataset` (and may be read-only, depending on the
# pandas version), so work on a private copy.
x = x.copy()
x[:, 3] = pcs
x[:, 1] = display

# Print a sample of the converted values (first five entries only).
print("Converted Price:")
print(y[:5])
print("Converted Primary Clock Speed:")
print(pcs[:5])
print("Converted Display Size:")
print(display[:5])

In [None]:
# Convert storage (column 4 of x) and RAM (column 5 of x) to numbers in MB:
# "GB" is rewritten as a multiplication by 1024 and the "MB" suffix removed.
# SECURITY: eval() executes whatever expression the CSV cell contains. It is
# kept so that existing values are parsed exactly as before, but it must only
# be run on trusted data. TODO: replace with an explicit parser.
storage = x[:, 4]
storage = np.array([eval(str(val).replace('GB', '*1024').replace('MB', '')) for val in storage])

ram = x[:, 5]
ram = np.array([eval(str(val).replace('GB', '*1024').replace('MB', '')) for val in ram])

# Convert the battery capacity (7th column from the end) to an integer by
# removing the "mAh" suffix.
bat_cap = x[:, -7]
bat_cap = np.array([int(str(val).replace('mAh', '')) for val in bat_cap])

# Convert the weight (6th column from the end) to a float by removing every
# "g" character.
weight = x[:, -6]
weight = np.array([float(str(val).replace('g', '')) for val in weight])

# Store the converted values back into the feature matrix. Previously they were
# only printed, so x kept the raw values in these four columns.
# x may share memory with `dataset` (and may be read-only, depending on the
# pandas version), so work on a private copy.
x = x.copy()
x[:, 4] = storage
x[:, 5] = ram
x[:, -7] = bat_cap
x[:, -6] = weight

# Print a sample of the converted values (first five entries only).
print("Converted Storage:")
print(storage[:5])
print("Converted RAM:")
print(ram[:5])
print("Converted Battery Capacity:")
print(bat_cap[:5])
print("Converted Weight:")
print(weight[:5])

In [None]:
# Show floats with 5 decimal places and no scientific notation. This is set
# first so that every print in this cell uses the same formatting.
np.set_printoptions(suppress=True, precision=5)

# Label-encode the categorical columns of the feature matrix. The encoder is
# re-fitted on each column because DataFrame.apply calls fit_transform once
# per column.
df = pd.DataFrame(data=x)
categorical_cols = [0, 2, 6, 7, 8, 9, 10]
df[categorical_cols] = df[categorical_cols].apply(LabelEncoder().fit_transform)
x = df.iloc[:, :].values

# Split into training (80%) and test (20%) sets; random_state makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=3, test_size=0.2)
print("x_train before scaling:")
print(x_train[:5])

# Standardize the features. The scaler is fitted on the training set only and
# then applied to the test set.
sc_feature_mat = StandardScaler()
x_train = sc_feature_mat.fit_transform(x_train)
x_test = sc_feature_mat.transform(x_test)

# Standardize the target the same way (fitted on y_train only).
sc_dep_vec = StandardScaler()
y_train = sc_dep_vec.fit_transform(y_train)
y_test = sc_dep_vec.transform(y_test)

# Print a sample (first five rows) of each transformed array instead of the
# full arrays, together with the shapes.
print("Transformed x_train:", x_train.shape)
print(x_train[:5])
print("Transformed x_test:", x_test.shape)
print(x_test[:5])
print("Transformed y_train:", y_train.shape)
print(y_train[:5])
print("Transformed y_test:", y_test.shape)
print(y_test[:5])

In [None]:


# Random forest with 100 trees; random_state makes the fit reproducible.
regressor = RandomForestRegressor(n_estimators=100, random_state=3)

# Train the model. y_train is a column vector after scaling, while fit()
# expects a 1-D target, so it is flattened here.
regressor.fit(x_train, np.ravel(y_train))

# Make predictions on the test set (in standardized target units).
y_pred = regressor.predict(x_test)

# Calculate the R2 score.
r2 = r2_score(y_test, y_pred)
print("R2 Score:", r2)

# The axes below are labelled as prices, so map the standardized targets and
# predictions back to the original units with the target scaler.
y_pred_price = sc_dep_vec.inverse_transform(np.reshape(y_pred, (-1, 1))).ravel()
y_test_price = sc_dep_vec.inverse_transform(np.reshape(y_test, (-1, 1))).ravel()

# Predicted vs. original scatter plot; the blue line marks perfect predictions.
fig, ax = plt.subplots()
ax.scatter(y_pred_price, y_test_price, color='red')
price_range = [y_test_price.min(), y_test_price.max()]
ax.plot(price_range, price_range, color='blue')
ax.set_title("Predicted vs. Original Values")
ax.set_xlabel("Predicted value of price")
ax.set_ylabel("Original price of the model")
plt.show()

# Average price per model. The 'Price' column is converted to integers first
# using the same digits-only rule that the notebook applies to the target,
# because a mean cannot be computed on text values.
# NOTE(review): as with the target, a decimal point would be dropped -- confirm
# the prices contain no decimals.
price_numeric = dataset['Price'].astype(str).str.replace(r'\D', '', regex=True).astype(int)
avgPrice = price_numeric.groupby(dataset['Model Name']).mean()

fig, ax = plt.subplots(figsize=(16, 10))
avgPrice.plot(kind='bar', ax=ax)
ax.set_xlabel('Name of the Model')
ax.set_ylabel('Average Price of each Model')
ax.set_title('Graph of Average Price of each Model')
plt.show()