In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import MultiLabelBinarizer

pd.options.mode.chained_assignment = None

In [2]:
df = pd.read_csv(r"D:\asu\Semester 6 - Spring 23\CSE472 - Artificial Intelligence\another version of Der3 version\App Rating Competition\path\train.csv")
df.rename(columns={
    'X0': 'AppName',
    'X1': 'Category',
    'X2': 'NumReviews',
    'X3': 'AppSize',
    'X4': 'NumInstalls',
    'X5': 'IsFree',
    'X6': 'Price',
    'X7': 'AgeCategory',
    'X8': 'Genres',
    'X9': 'LastUpdate',
    'X10': 'Version',
    'X11': 'MinAndroidVer',
    'Y': 'Rating'
}, inplace=True)

# Drop AppName
df.drop(columns=['AppName'], inplace=True)

## Category Column

In [3]:
# Step 1: Remove the erroneous '1.9' category row
df = df[df['Category'] != '1.9']

# Step 2: Group smaller categories to reduce imbalance
# Categories with fewer than 50 entries get grouped into "OTHER"
category_counts = df['Category'].value_counts()
small_categories = category_counts[category_counts < 50].index.tolist()
df['Category_Grouped'] = df['Category'].apply(
    lambda x: 'OTHER' if x in small_categories else x)

# Step 3: One-hot encode the grouped categories
# This creates binary columns for each category
category_dummies = pd.get_dummies(df['Category_Grouped'], prefix='Cat')

# Step 4: Join the new dummy columns to the original dataframe
df = pd.concat([df, category_dummies], axis=1)

# Step 5: Drop the original Category column to avoid redundancy
# Keep Category_Grouped for reference if needed
df.drop(columns=['Category', 'Category_Grouped'], inplace=True)

## Fixing NumReviews

In [4]:
df["NumReviews"] = pd.to_numeric(df["NumReviews"], errors='raise')

## Fixing AppSize & numinstalls

In [5]:
def convert_to_mb(size_str):
    if isinstance(size_str, str):
        if 'k' in size_str.lower():
            # Convert kilobytes to megabytes (divide by 1024)
            return float(size_str.lower().replace('k', '').strip()) / 1024
        elif 'm' in size_str.lower():
            # Convert megabytes to megabytes (already in the correct unit)
            return float(size_str.lower().replace('m', '').strip())
        elif 'varies with device' in size_str.lower():
            return np.nan
        else:
            # Already in megabytes
            return float(size_str)
    return size_str


# Apply the conversion function
df["AppSize"] = df["AppSize"].apply(convert_to_mb)

# Convert to numeric, forcing errors to NaN
df["AppSize"] = pd.to_numeric(df["AppSize"], errors='coerce')

# Fill NaN values with median of the column
app_size_median = df["AppSize"].median()  # We can modify this later

df["AppSize"].fillna(app_size_median, inplace=True)

entries_where_end_is_plus = df["NumInstalls"].map(lambda x: x.endswith('+'))

# Remove the '+' sign and convert to numeric
df["NumInstalls"] = df["NumInstalls"].map(
    lambda x: x[:-1].replace(',', '') if x.endswith('+') else x)
df["NumInstalls"] = pd.to_numeric(df["NumInstalls"], errors='raise')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["AppSize"].fillna(app_size_median, inplace=True)


## Fixing Prices

In [6]:
# Replace instances of "Free" with 0 and "Paid" with 1
df["IsFree"] = df["IsFree"].map(lambda x: 0 if x == "Free" else 1)
# Remove dollar sign before converting to numeric
df["Price"] = df["Price"].replace({'\$': ''}, regex=True)
df["Price"] = pd.to_numeric(df["Price"], errors='raise')

  df["Price"] = df["Price"].replace({'\$': ''}, regex=True)


## Fixing Age Category

In [7]:
age_category_dummies = pd.get_dummies(df['AgeCategory'], prefix='Age')

# Add the one-hot encoded columns to the dataframe
df = pd.concat([df, age_category_dummies], axis=1)

# Optionally, drop the original AgeCategory column if you don't need it anymore
df.drop('AgeCategory', axis=1, inplace=True)

## Fixing Genres, Year, Version and MinAndroidVersion

In [8]:
df['Genre_split'] = df['Genres'].str.split(';')

# Step 2: Use MultiLabelBinarizer to create binary columns
mlb = MultiLabelBinarizer()
genre_encoded = pd.DataFrame(
    mlb.fit_transform(df['Genre_split']),
    columns=mlb.classes_,
    index=df.index
)

# Step 3: Combine the original DataFrame with the encoded genres
df_encoded = pd.concat([df, genre_encoded], axis=1)

# If you want to drop the original and intermediate columns
df_encoded = df_encoded.drop(['Genres', 'Genre_split'], axis=1)

df = df_encoded

In [9]:
# df["Year"] = df["LastUpdate"]
# Dates are in mixed formats, so use format='mixed'
df["LastUpdate"] = pd.to_datetime(df["LastUpdate"], format='mixed')

# Extract the year into a new column
df["Year"] = df["LastUpdate"].dt.year

df.drop(columns=["LastUpdate"], inplace=True)

In [10]:
# Drop Version and MinAndroidVer columns
df = df.drop(['Version', 'MinAndroidVer'], axis=1)

In [11]:
# Convert boolean columns to int (0/1)
bool_columns = df.select_dtypes(include=['bool']).columns
for col in bool_columns:
    df[col] = df[col].astype(int)

# No need to convert int64 columns as they are already numeric

# Make sure all float columns are properly formatted
float_columns = df.select_dtypes(include=['float64']).columns
for col in float_columns:
    # Handle any potential non-numeric values
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Make sure Year is properly formatted as numeric
df['Year'] = pd.to_numeric(df['Year'], errors='coerce').astype('int32')

# Handle missing values in Rating (our target variable)
# Since Rating is what we're trying to predict, we'll drop rows with missing Rating values
df_for_ml = df.dropna(subset=['Rating'])

# Assert no NaNs
assert df_for_ml.isnull().sum().sum() == 0, "There are still NaN values in the DataFrame."

# Assert no text columns
assert df_for_ml.select_dtypes(include=['object']).empty, "There are still text columns in the DataFrame."

# Assert no infinite values
assert not np.isinf(df_for_ml).any().any(), "There are infinite values in the DataFrame."

# Save data after cleaning

In [12]:
import os

# Save the cleaned DataFrame to a CSV file
cleaned_file = r"D:\asu\Semester 6 - Spring 23\CSE472 - Artificial Intelligence\another version of Der3 version\App Rating Competition\cleaned_train.csv"

# Ensure the directory exists
os.makedirs(os.path.dirname(cleaned_file), exist_ok=True)

df_for_ml.to_csv(cleaned_file, index=False)
print(f"Cleaned DataFrame saved to {cleaned_file}")

Cleaned DataFrame saved to D:\asu\Semester 6 - Spring 23\CSE472 - Artificial Intelligence\another version of Der3 version\App Rating Competition\cleaned_train.csv


# Normailzation of number of installs and number of reviews


In [13]:
df['NumInstalls'] = np.log1p(df['NumInstalls'])
df['NumReviews'] = np.log1p(df['NumReviews']) 
# add those to the new cleaned dataframe 
df_for_ml = df.dropna(subset=['Rating'])
normalized_file = r"D:\asu\Semester 6 - Spring 23\CSE472 - Artificial Intelligence\another version of Der3 version\App Rating Competition/normalized_train.csv"
df_for_ml.to_csv(normalized_file, index=False)
print(f"Normalized DataFrame saved to {normalized_file}")




Normalized DataFrame saved to D:\asu\Semester 6 - Spring 23\CSE472 - Artificial Intelligence\another version of Der3 version\App Rating Competition/normalized_train.csv


# Splitting the data into train , validation and testing

In [14]:
import os
#split the data into 70% training and 15% validation and 15% testing
from sklearn.model_selection import train_test_split

# First split: 70% train, 30% temp
train_df, temp_df = train_test_split(df_for_ml, test_size=0.40, random_state=42)

# Second split: 15% val, 15% test from the 30% temp
val_df, test_df = train_test_split(temp_df, test_size=0.50, random_state=42)

# Ensure the output directory exists
output_dir = r"D:\asu\Semester 6 - Spring 23\CSE472 - Artificial Intelligence\another version of Der3 version\App Rating Competition\after_split"
os.makedirs(output_dir, exist_ok=True)

# Save the splits to CSV files
train_df.to_csv(os.path.join(output_dir, "train.csv"), index=False)
val_df.to_csv(os.path.join(output_dir, "val.csv"), index=False)
test_df.to_csv(os.path.join(output_dir, "test.csv"), index=False)
print("Data splits saved to CSV files.")



Data splits saved to CSV files.
