In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
import re
#imputer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the pre-cleaned ratings export from the working directory.
data = pd.read_csv(filepath_or_buffer='cleaned_data.csv')

# Quick look at the country column's summary statistics.
data.loc[:, 'User-Country'].describe()

In [None]:
# Normalise dtypes: IDs and genres as strings, ratings as floats, ages as ints.
data['User-ID'] = data['User-ID'].astype(str)
data['Genre'] = data['Genre'].astype(str)
data['Book-Rating'] = data['Book-Rating'].astype(float)
data['User-Age'] = data['User-Age'].astype(int)

# User Interaction Features (each pivot has one row per User-ID, one column per Genre)
# Count of Books Read per Genre
genre_counts = data.pivot_table(index='User-ID', columns='Genre', values='ISBN', aggfunc='count', fill_value=0)

# Average Rating per Genre
average_ratings = data.pivot_table(index='User-ID', columns='Genre', values='Book-Rating', aggfunc='mean', fill_value=0)

# Variability in Ratings per Genre
std_dev_ratings = data.pivot_table(index='User-ID', columns='Genre', values='Book-Rating', aggfunc='std', fill_value=0)

# Combine these features into a single DataFrame.
# Count columns keep the bare genre name; means get '_avg' and std-devs get '_std'.
user_features = genre_counts.join(average_ratings, rsuffix='_avg').join(std_dev_ratings, rsuffix='_std')

# Add the age and country columns from the original data.
# `data` can hold several rows per User-ID (the pivots above aggregate over them),
# so joining it directly would repeat every user once per matching row -- and the
# second join would multiply those repeats again. Collapse to one row per user first;
# `first()` takes the first non-null value seen for each user.
user_attributes = data.groupby('User-ID')[['User-Age', 'User-Country']].first()
user_features = user_features.join(user_attributes)

# Invariant relied on by the clustering cells: exactly one feature row per user.
assert user_features.index.is_unique, "user_features must have one row per User-ID"

# Example output
user_features.head(100)

In [None]:
import pandas as pd

# Age brackets are half-open, [lower, upper), because right=False below.
# Ages outside [0, 100) match no bracket and come out as NaN.
labels = ['<18', '18-25', '25-35', '35-45', '45-55', '55-65', '65+']
bins = [0, 18, 25, 35, 45, 55, 65, 100]

age_group = pd.cut(
    user_features['User-Age'],
    bins=bins,
    labels=labels,
    right=False,
)
user_features['Age Group'] = age_group
user_features.head(100)

#### Downsampling Age Group

In [None]:
# Disabled experiment: this whole cell is a bare string literal, so the code quoted
# inside it is never executed.
# The quoted code would downsample the '25-35' age group to the median age-group size
# (sampling with random_state=42) and rebuild user_features from the result.
"""age_group_counts = user_features['Age Group'].value_counts()
print(age_group_counts)
# Calculate the target size, for example, the median size of the age groups
target_size = age_group_counts.median()

# Separate the overrepresented group from the rest
overrepresented = user_features[user_features['Age Group'] == '25-35']
other_groups = user_features[user_features['Age Group'] != '25-35']

# Downsample the overrepresented group
downsampled = overrepresented.sample(n=int(target_size), random_state=42)

balanced_user_features = pd.concat([downsampled, other_groups], ignore_index=True)
user_features = pd.concat([downsampled, other_groups], ignore_index=True)

age_group_counts = user_features['Age Group'].value_counts()
print(age_group_counts)"""

In [None]:
# Tally how many rows fall into each age bracket.
age_group_counts = user_features.loc[:, 'Age Group'].value_counts()
print(age_group_counts)

In [None]:
from sklearn.impute import SimpleImputer

# Fill gaps in the int64/float64 columns with each column's median.
numeric_columns = user_features.select_dtypes(include=['int64', 'float64']).columns
numeric_imputer = SimpleImputer(strategy='median')
imputed_numeric = numeric_imputer.fit_transform(user_features[numeric_columns])
user_features[numeric_columns] = imputed_numeric

# Fill gaps in the object-dtype columns with each column's most frequent value.
# Only 'object' columns are selected here, so a 'category' column such as the one
# pd.cut produces is not touched by either imputer.
categorical_columns = user_features.select_dtypes(include=['object']).columns
categorical_imputer = SimpleImputer(strategy='most_frequent')
imputed_categorical = categorical_imputer.fit_transform(user_features[categorical_columns])
user_features[categorical_columns] = imputed_categorical

In [None]:
# Re-check the age bracket sizes now that the imputation cell has run.
age_group_counts = user_features.loc[:, 'Age Group'].value_counts()
print(age_group_counts)

In [None]:
# Histogram of user ages as they stand after the imputation step.
age_fig, age_ax = plt.subplots(figsize=(10, 6))
user_features['User-Age'].hist(bins=20, ax=age_ax)
age_ax.set_title('Distribution of User Age After Median Imputation')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Keep an un-encoded snapshot (it still has 'User-Country' and 'Age Group') for later plots.
user_features_visual = user_features.copy()

# Indicator columns: 'Country_*' for country and 'Age_*' for age bracket.
country_dummies = pd.get_dummies(user_features['User-Country'], prefix='Country')
age_dummies = pd.get_dummies(user_features['Age Group'], prefix='Age')

# Swap the two raw columns for their indicator columns in a single concat.
user_features = pd.concat(
    [
        user_features.drop(columns=['User-Country', 'Age Group']),
        country_dummies,
        age_dummies,
    ],
    axis=1,
)

user_features.head(100)

In [None]:
from sklearn.preprocessing import StandardScaler

# Select only numerical features for scaling
numerical_features = user_features.select_dtypes(include=['int64', 'float64'])
scaler = StandardScaler()
scaled_numerical_features = scaler.fit_transform(numerical_features)

# Re-wrap the scaled array so it keeps the original column names and row index.
scaled_numeric_frame = pd.DataFrame(
    scaled_numerical_features,
    columns=numerical_features.columns,
    index=numerical_features.index,
)

# One-hot indicator columns are passed through unscaled.
# pd.get_dummies yields uint8 columns on older pandas and bool columns on pandas >= 2.0;
# selecting only 'uint8' would silently drop every indicator on newer versions.
indicator_features = user_features.select_dtypes(include=['uint8', 'bool', 'category'])

# Combine scaled numerical features back with categorical
scaled_features = pd.concat([scaled_numeric_frame, indicator_features], axis=1)

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# TODO: apply a decision tree classifier (this cell is currently an empty stub)


# K-MEANS

In [None]:
from sklearn.cluster import KMeans
# Apply K-means clustering
# random_state is fixed so the cluster labels -- and every plot and score derived from
# them -- are the same on each run. n_init is pinned because the library default
# differs between scikit-learn releases.
kmeans = KMeans(n_clusters=7, n_init=10, random_state=42)  # Adjust the number of clusters as needed
clusters = kmeans.fit_predict(scaled_features)

# Attach the label to the unscaled features so cluster profiles are in original units.
user_features['Cluster'] = clusters
cluster_profiles = user_features.groupby('Cluster').mean()
print(cluster_profiles.head())

In [None]:
def _most_common(values):
    """Return the first mode of `values`, or NaN when there is no mode."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# Group by cluster and calculate mean for numerical features and mode for categorical features.
# The grouping key itself is left out: aggregating 'Cluster' within each cluster only
# repeats the group label as a redundant column, which then clashes with the index
# name on reset_index().
feature_columns = user_features.columns.drop('Cluster')
mean_spec = {col: 'mean' for col in feature_columns if user_features[col].dtype != 'object'}
mode_spec = {col: _most_common for col in feature_columns if user_features[col].dtype == 'object'}

# Mean-aggregated columns come first, then mode-aggregated ones.
cluster_summary = user_features.groupby('Cluster').agg({**mean_spec, **mode_spec})
cluster_summary

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Extract the genre columns.
# The per-genre count columns keep the bare genre names from the genre_counts pivot,
# so take them from there instead of assuming they are the first 9 columns.
genre_columns = list(genre_counts.columns)

# Visualization of genre preferences per cluster: one bar chart per genre,
# showing the per-cluster level of that genre's count column.
for genre in genre_columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x='Cluster', y=genre, data=user_features, ax=ax)
    ax.set_title(f'Preference for {genre} by Cluster')
    plt.show()
    # Release the figure explicitly so a long genre list does not accumulate open figures.
    plt.close(fig)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Annotated heatmap of the per-cluster feature means held in cluster_profiles.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cluster_profiles,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    ax=heatmap_ax,
)
heatmap_ax.set_title('Heatmap of Cluster Centroids')
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Cross-tabulate cluster labels against the age brackets kept in the un-encoded snapshot.
cluster_age_distribution = pd.crosstab(
    index=user_features['Cluster'],
    columns=user_features_visual['Age Group'],
)

# Stacked bars: one bar per cluster, split by age bracket.
age_ax = cluster_age_distribution.plot(kind='bar', stacked=True, figsize=(10, 7))
age_ax.set_title('Age Distribution by Cluster')
age_ax.set_xlabel('Cluster')
age_ax.set_ylabel('Count')
age_ax.legend(title='Age Group')
plt.show()


In [None]:
from sklearn.metrics import silhouette_score

# Silhouette score of the K-means labels, computed on the same matrix the model was fit on.
score = silhouette_score(X=scaled_features, labels=clusters)
print('Silhouette Score: %.2f' % (score,))