In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, classification_report, log_loss, roc_auc_score
import mlflow
import mlflow.sklearn
import joblib
import boto3
from botocore.exceptions import NoCredentialsError

In [2]:
# Seed NumPy's global random generator so runs are repeatable.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [3]:
# Absolute locations of the three raw CSV files.
books_path = r"D:\data\6th sem\Big data analytics\theory project\archive\data\Books.csv"
ratings_path = r"D:\data\6th sem\Big data analytics\theory project\archive\data\Ratings.csv"
# NOTE(review): unlike the two paths above, this one has no "data" sub-folder — confirm it is intentional.
users_path = r"D:\data\6th sem\Big data analytics\theory project\archive\Users.csv"

# Every file is decoded as latin1, so share the reader options.
csv_options = {'encoding': 'latin1'}
books_df = pd.read_csv(books_path, **csv_options)
ratings_df = pd.read_csv(ratings_path, **csv_options)
users_df = pd.read_csv(users_path, **csv_options)

# Preview the first rows of each frame to verify loading.
for heading, frame in (
    ("Books Data:", books_df),
    ("\nRatings Data:", ratings_df),
    ("\nUsers Data:", users_df),
):
    print(heading)
    print(frame.head())


FileNotFoundError: [Errno 2] No such file or directory: 'D:\\data\\6th sem\\Big data analytics\\theory project\\archive\\Books.csv'

Step 1: Handle Missing Values

In [None]:
# Print the per-column null count of each of the three frames.
for heading, frame in (
    ("Books Missing Values:\n", books_df),
    ("\nRatings Missing Values:\n", ratings_df),
    ("\nUsers Missing Values:\n", users_df),
):
    print(heading, frame.isnull().sum())


In [None]:
# Parse the publication year as a number; entries that cannot be parsed become NaN.
year_numeric = pd.to_numeric(books_df['Year-Of-Publication'], errors='coerce')

# Impute the NaN years (missing or unparseable) with the median of the parsed years.
median_year = year_numeric.median()
books_df['Year-Of-Publication'] = year_numeric.fillna(median_year)


In [None]:
# Median of the known ages, used as the replacement value.
median_age = users_df['Age'].median()

# Keep every known age and substitute the median wherever the age is missing.
users_df['Age'] = users_df['Age'].where(users_df['Age'].notna(), median_age)


In [None]:
# Discard every rating row that has a missing value in any column.
ratings_df = ratings_df.dropna()


Step 2: Feature Engineering

In [None]:
# Feature engineering
# Bucket each publication year into the decade it belongs to (e.g. 1997 -> 1990).
books_df['Decade'] = (books_df['Year-Of-Publication'] // 10) * 10

# Guard on 'Location' so that re-running this cell after the column has been
# dropped is a no-op instead of a KeyError.
if 'Location' in users_df.columns:
    # The third comma-separated part of Location is taken as the country.
    # `.str[2]` yields NaN for rows with fewer than three parts and, unlike
    # indexing column 2 of an expanded split, cannot raise KeyError when no
    # row at all has a third part.
    country_raw = users_df['Location'].str.split(',').str[2].str.strip()
    # Blank third parts are treated like missing ones.
    users_df['Country'] = country_raw.mask(country_raw.eq(''), 'Unknown').fillna('Unknown')
    users_df = users_df.drop(columns=['Location'])

In [None]:
# Mean rating per book and per user, each as a two-column lookup frame.
book_avg_rating = (
    ratings_df.groupby('ISBN')
    .agg(Avg_Book_Rating=('Book-Rating', 'mean'))
    .reset_index()
)
user_avg_rating = (
    ratings_df.groupby('User-ID')
    .agg(Avg_User_Rating=('Book-Rating', 'mean'))
    .reset_index()
)

# Columns carried over from the book and user tables.
book_columns = ['ISBN', 'Book-Title', 'Book-Author', 'Decade']
user_columns = ['User-ID', 'Age', 'Country']

# Inner joins: a rating survives only if its book and its user are both known.
ratings_with_details = (
    ratings_df
    .merge(books_df[book_columns], on='ISBN', how='inner')
    .merge(users_df[user_columns], on='User-ID', how='inner')
    .merge(book_avg_rating, on='ISBN', how='inner')
    .merge(user_avg_rating, on='User-ID', how='inner')
)

# Binary target: 1 when the rating is 7 or above, otherwise 0.
ratings_with_details['High_Rating'] = ratings_with_details['Book-Rating'].ge(7).astype(int)

Step 3: Normalization

In [None]:
# NOTE(review): this cell rebuilds the same aggregates, joins and target as the
# Step 2 cell; no normalization is performed here and the imported MinMaxScaler
# is not used in the visible notebook — confirm whether a scaling step is missing.

# Average rating received by each book / given by each user.
book_avg_rating = (
    ratings_df.groupby('ISBN', as_index=False)['Book-Rating']
    .mean()
    .rename(columns={'Book-Rating': 'Avg_Book_Rating'})
)
user_avg_rating = (
    ratings_df.groupby('User-ID', as_index=False)['Book-Rating']
    .mean()
    .rename(columns={'Book-Rating': 'Avg_User_Rating'})
)

# Join the ratings with each lookup table in turn (inner joins on the given key).
ratings_with_details = ratings_df
for right_frame, join_key in (
    (books_df[['ISBN', 'Book-Title', 'Book-Author', 'Decade']], 'ISBN'),
    (users_df[['User-ID', 'Age', 'Country']], 'User-ID'),
    (book_avg_rating, 'ISBN'),
    (user_avg_rating, 'User-ID'),
):
    ratings_with_details = ratings_with_details.merge(right_frame, on=join_key, how='inner')

# Flag ratings of 7 or more as the positive class.
is_high_rating = ratings_with_details['Book-Rating'] >= 7
ratings_with_details['High_Rating'] = is_high_rating.astype(int)

In [None]:
# Map each country string to an integer code; the fitted encoder is kept in
# `label_encoder` so the mapping stays available.
label_encoder = LabelEncoder().fit(ratings_with_details['Country'])
ratings_with_details['Country_Encoded'] = label_encoder.transform(ratings_with_details['Country'])

In [None]:
# NLP: TF-IDF over "<title> <author>" for every book.
from sklearn.feature_extraction.text import TfidfVectorizer

# Missing titles/authors contribute an empty string rather than NaN.
titles = books_df['Book-Title'].fillna('')
authors = books_df['Book-Author'].fillna('')
books_df['Text'] = titles + ' ' + authors

# English stop words are removed and the vocabulary is capped at 5000 terms.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
# One row per row of books_df, in the same order.
tfidf_matrix = tfidf.fit_transform(books_df['Text'])

Step 4: Save Processed Data

In [None]:
# Persist the processed ratings table, the TF-IDF matrix and the fitted vectorizer.
import joblib

ratings_with_details.to_csv('processed_ratings_with_details.csv', index=False)

# NOTE(review): .toarray() materialises the sparse TF-IDF matrix as a dense
# array before saving; for a large catalogue this may need a lot of memory —
# consider a sparse on-disk format if that becomes a problem.
dense_tfidf = tfidf_matrix.toarray()
np.save('tfidf_matrix.npy', dense_tfidf)

joblib.dump(tfidf, 'tfidf_vectorizer.joblib')

# Model Development

In [None]:
# Model inputs: the five numeric/encoded columns below; target: High_Rating.
feature_columns = [
    'Age',
    'Avg_Book_Rating',
    'Avg_User_Rating',
    'Decade',
    'Country_Encoded',
]
X = ratings_with_details[feature_columns]
y = ratings_with_details['High_Rating']

# Hold out 20% of the rows for evaluation; the split is seeded for repeatability.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# MLflow setup
import mlflow

# Runs started after this call are recorded under this experiment.
mlflow.set_experiment('book_recommendation')


def train_and_log_model(model, model_name, X_train, X_test, y_train, y_test):
    """Fit ``model``, evaluate it on the test split and record the run in MLflow.

    Inside a single MLflow run named ``model_name`` this logs the accuracy,
    the log-loss and ROC-AUC (only when the model exposes ``predict_proba``),
    the classification report as ``classification_report.json`` and the fitted
    model itself. The fitted model is also dumped to ``<model_name>.joblib``
    in the working directory.

    Args:
        model: Estimator providing ``fit`` and ``predict``.
        model_name: Used as the run name, the logged model name and the stem
            of the joblib file.
        X_train, y_train: Data the model is fitted on.
        X_test, y_test: Data the metrics are computed on.

    Returns:
        tuple: ``(accuracy, logloss, roc_auc)``; the last two are ``None``
        when the model has no ``predict_proba``.
    """
    with mlflow.start_run(run_name=model_name):
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        # Probability-based metrics are only available with predict_proba.
        supports_proba = hasattr(model, 'predict_proba')
        probabilities = model.predict_proba(X_test) if supports_proba else None

        accuracy = accuracy_score(y_test, predictions)
        if probabilities is None:
            logloss = None
            roc_auc = None
        else:
            logloss = log_loss(y_test, probabilities)
            # Column 1 holds the probability of the positive class.
            roc_auc = roc_auc_score(y_test, probabilities[:, 1])

        mlflow.log_metric('accuracy', accuracy)
        for metric_name, metric_value in (('log_loss', logloss), ('roc_auc', roc_auc)):
            if metric_value is not None:
                mlflow.log_metric(metric_name, metric_value)

        mlflow.log_dict(
            classification_report(y_test, predictions, output_dict=True),
            'classification_report.json',
        )
        mlflow.sklearn.log_model(model, model_name)
        joblib.dump(model, f'{model_name}.joblib')
        return accuracy, logloss, roc_auc

In [None]:
# Candidate classifiers (not fitted yet); each one is seeded with 42.
rf_model = RandomForestClassifier(
    n_estimators=50,
    max_depth=5,
    random_state=42,
    n_jobs=-1,
)
xgb_model = XGBClassifier(
    n_estimators=50,
    max_depth=3,
    subsample=0.8,
    n_jobs=-1,
    eval_metric='logloss',
    random_state=42,
)
lr_model = LogisticRegression(
    max_iter=1000,
    random_state=42,
)

In [None]:
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score

# Models are trained in this order; the label doubles as the MLflow run name.
named_models = [
    ('RandomForest', rf_model),
    ('XGBoost', xgb_model),
    ('LogisticRegression', lr_model),
]

# One (accuracy, log-loss, roc-auc) tuple per model.
results = [
    train_and_log_model(estimator, label, X_train, X_test, y_train, y_test)
    for label, estimator in named_models
]

# Save results: one row per model, one column per metric.
results_df = pd.DataFrame(
    results,
    columns=['Accuracy', 'Log-Loss', 'ROC-AUC'],
    index=[label for label, _ in named_models],
)
results_df.to_csv('model_comparison.csv')

In [None]:
# Recommendation function
from sklearn.metrics.pairwise import cosine_similarity

def recommend_book(user_id, book_title, model, tfidf, tfidf_matrix, books_df, ratings_with_details):
    """Recommend a single book for a user, seeded by a book title.

    Strategy:
      1. Find the first book whose title contains ``book_title``
         (case-insensitive, literal match), rank all other books by TF-IDF
         cosine similarity to it, keep the 10 most similar and return the
         most similar one whose average rating is at least 7.
      2. If no title matches, or none of those 10 qualifies, fall back to the
         book with the highest mean rating in ``ratings_with_details``.

    Besides its arguments, the function reads the module-level names
    ``feature_columns`` and ``book_avg_rating``.

    Args:
        user_id: Value matched against the 'User-ID' column.
        book_title: Text searched for inside 'Book-Title'.
        model: Fitted classifier; its ``predict_proba`` is called when present.
        tfidf: Fitted vectorizer (accepted for interface compatibility; not
            used by the body).
        tfidf_matrix: TF-IDF matrix whose rows are assumed to follow the row
            order of ``books_df``.
        books_df: Book table with 'ISBN', 'Book-Title' and 'Book-Author'.
        ratings_with_details: Ratings table containing ``feature_columns``,
            'User-ID', 'ISBN' and 'Book-Rating'.

    Returns:
        pandas.Series: 'Book-Title', 'Book-Author' and 'Avg_Book_Rating'
        (similarity path) or 'Book-Title', 'Book-Author' and 'Book-Rating'
        (fallback path).
    """
    # --- Collaborative signal: probability of a high rating for this user ---
    user_rows = ratings_with_details.loc[
        ratings_with_details['User-ID'] == user_id, feature_columns
    ]
    # Test emptiness on the rows, not on their mean: the mean of an empty frame
    # is a non-empty all-NaN Series, so checking `.empty` on it never triggers
    # and unknown users would be scored on NaN features.
    if user_rows.empty:
        user_rows = ratings_with_details[feature_columns]
    user_data = user_rows.mean()
    if hasattr(model, 'predict_proba'):
        # A one-row frame keeps the feature names the model was fitted with.
        pred_proba = model.predict_proba(user_data.to_frame().T)[0][1]
    else:
        pred_proba = 0.5
    # NOTE(review): pred_proba is computed but does not influence the ranking
    # below — confirm how it is meant to be combined with the content score.

    # --- Content-based filtering: books similar to the requested title ---
    # regex=False: titles may contain characters such as '(' or '+' that would
    # otherwise be interpreted as a regular expression.
    title_mask = books_df['Book-Title'].str.contains(
        book_title, case=False, na=False, regex=False
    )
    # Work with row positions so the lookup into tfidf_matrix and the .iloc
    # below agree regardless of the index labels books_df carries.
    match_positions = np.flatnonzero(title_mask.to_numpy())
    if match_positions.size:
        book_pos = match_positions[0]
        similarities = cosine_similarity(tfidf_matrix[book_pos], tfidf_matrix).flatten()
        ranked = similarities.argsort()[::-1]
        # Drop the query book itself: it is always its own best match and
        # would otherwise be returned as its own recommendation.
        ranked = ranked[ranked != book_pos][:10]
        similar_books = books_df.iloc[ranked]
        # Filter by popularity (high average rating); the left merge keeps the
        # similarity order, so the first remaining row is the most similar.
        similar_books = similar_books.merge(book_avg_rating, on='ISBN', how='left')
        similar_books = similar_books[similar_books['Avg_Book_Rating'] >= 7]
        if not similar_books.empty:
            return similar_books.iloc[0][['Book-Title', 'Book-Author', 'Avg_Book_Rating']]

    # --- Fallback: the book with the highest mean rating ---
    rated = ratings_with_details
    # Attach title/author only when they are not already present; merging them
    # a second time produces suffixed columns and breaks the groupby below.
    if not {'Book-Title', 'Book-Author'}.issubset(rated.columns):
        rated = rated.merge(books_df[['ISBN', 'Book-Title', 'Book-Author']], on='ISBN')
    popular_book = (
        rated.groupby(['ISBN', 'Book-Title', 'Book-Author'])['Book-Rating']
        .mean()
        .reset_index()
        .sort_values('Book-Rating', ascending=False)
        .iloc[0]
    )
    return popular_book[['Book-Title', 'Book-Author', 'Book-Rating']]

# Example recommendation: user 276725, seeded with the title 'Clara Callan',
# scored with the XGBoost model.
recommended_book = recommend_book(
    276725,
    'Clara Callan',
    xgb_model,
    tfidf,
    tfidf_matrix,
    books_df,
    ratings_with_details,
)
recommended_title = recommended_book['Book-Title']
recommended_author = recommended_book['Book-Author']
print('Recommended Book:', recommended_title, 'by', recommended_author)

In [None]:
import boto3
from botocore.exceptions import NoCredentialsError, ClientError

def upload_to_s3(file_path, bucket_name, s3_path):
    """Upload a local file to S3, reporting failures instead of raising.

    Args:
        file_path: Path of the local file to upload.
        bucket_name: Name of the destination bucket.
        s3_path: Object key the file is stored under.

    Returns:
        bool: True when the upload succeeded, False when the local file is
        missing, credentials are not configured, or the upload was rejected.
        A message describing the outcome is printed in every case.
    """
    import os

    # A missing local file would otherwise surface as an uncaught exception
    # and abort the remaining uploads in the cell.
    if not os.path.isfile(file_path):
        print(f'Failed to upload {file_path}: local file not found.')
        return False

    # upload_file re-raises service-side ClientErrors as S3UploadFailedError,
    # which is not a ClientError subclass, so it has to be caught explicitly.
    from boto3.exceptions import S3UploadFailedError

    try:
        s3 = boto3.client('s3')
        s3.upload_file(file_path, bucket_name, s3_path)
        print(f'Uploaded {file_path} to s3://{bucket_name}/{s3_path}')
        return True
    except NoCredentialsError:
        print(f'Failed to upload {file_path}: AWS credentials not configured.')
        print('Please configure AWS credentials using "aws configure" or set environment variables:')
        print('  export AWS_ACCESS_KEY_ID="your_access_key"')
        print('  export AWS_SECRET_ACCESS_KEY="your_secret_key"')
        print('  export AWS_DEFAULT_REGION="eu-north-1"')
    except (ClientError, S3UploadFailedError) as e:
        print(f'Failed to upload {file_path}: {e}')
    return False

bucket_name = 'book-recommendation-bucket123'

# (local file, destination key) pairs, uploaded in the order listed.
artifacts_to_upload = [
    ('processed_ratings_with_details.csv', 'data/processed_ratings_with_details.csv'),
    ('RandomForest.joblib', 'models/RandomForest.joblib'),
    ('XGBoost.joblib', 'models/XGBoost.joblib'),
    ('LogisticRegression.joblib', 'models/LogisticRegression.joblib'),
    ('model_comparison.csv', 'results/model_comparison.csv'),
    ('tfidf_vectorizer.joblib', 'models/tfidf_vectorizer.joblib'),
    ('tfidf_matrix.npy', 'models/tfidf_matrix.npy'),
]
for local_file, s3_key in artifacts_to_upload:
    upload_to_s3(local_file, bucket_name, s3_key)