In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder


# Step 1: Data Preprocessing
app_data = pd.read_csv('/googleplaystore.csv')
review_data = pd.read_csv('/googleplaystore_user_reviews.csv')



In [None]:
# Perform necessary data cleaning and preprocessing steps for app_data
app_data.dropna(inplace=True)  # Drop rows with missing values
app_data = app_data[app_data['Rating'] <= 5]  # Remove outliers (ratings > 5)

# Convert categorical variables into numerical representation for app_data
category_encoder = LabelEncoder()
app_data['Category'] = category_encoder.fit_transform(app_data['Category'])

In [None]:
# Select relevant features from app_data
app_features = ['Category', 'Size', 'Installs', 'Reviews', 'Price']
app_target = 'Rating'
app_details = 'App'
app_data = app_data[[app_details] + app_features + [app_target]]

In [None]:
# Perform necessary data cleaning and preprocessing steps for review_data
review_data.dropna(inplace=True)  # Drop rows with missing values

# Merge app_data and review_data on the "App" column
data = pd.merge(app_data, review_data, on='App', how='inner')



In [None]:
# Cleaning the the features
def clean_size(size):
    if size.endswith('M'):
        return float(size[:-1]) * 1024 * 1024
    elif size.endswith('k'):
        return float(size[:-1]) * 1024
    else:
        return float(size)
data['Size'] = data['Size'].apply(clean_size)
def clean_installs(installs):
    return int(installs.replace(',', '').replace('+', ''))
data['Installs'] = data['Installs'].apply(clean_installs)
def clean_price(price):
    return float(price.strip('$').strip())
data['Price'] = data['Price'].apply(clean_price)




In [None]:
X_train, X_test, y_train, y_test = train_test_split(data[app_features], data[app_target], test_size=0.2, random_state=42)

# Step 3: Model Selection
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Step 4: Train the Model
model.fit(X_train, y_train)

# Step 5: Evaluate the Model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = model.score(X_test, y_test)
print('Mean Squared Error:', mse)
print('Mean Absolute Error:', mae)
print('R-squared:', r2)

Mean Squared Error: 0.00015013483533140394
Mean Absolute Error: 0.00026533002618981766
R-squared: 0.9980579275236666
