In [None]:
# Mount Google Drive at /content/Drive so later cells can read the dataset
# stored under /content/Drive/MyDrive.
from google.colab import drive
drive.mount('/content/Drive')

Mounted at /content/Drive


In [None]:
!pip install tensorflow

In [None]:
import pandas as pd

# Load the books dataset from the mounted Drive folder.
df = pd.read_csv('/content/Drive/MyDrive/NLP-LLM/books_1.Best_Books_Ever.csv')

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" to the output.
df.info()

# Preview the first 5 rows. As the last expression of the cell it is rendered
# as a rich table instead of plain text.
df.head()

In [None]:
!pip install vaderSentiment

In [None]:
import logging

import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split as surprise_train_test_split
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Setup logging: emit records at INFO level and above, each formatted as
# "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def load_data(file_path):
    """Load the books dataset from a CSV file and fill missing text fields.

    Args:
        file_path: Path or file-like object accepted by ``pd.read_csv``. The
            data must contain ``description`` and ``bookFormat`` columns.

    Returns:
        pandas.DataFrame: The loaded data, with missing ``description`` values
        replaced by ``''`` and missing ``bookFormat`` values by ``'Unknown'``.

    Raises:
        KeyError: If ``description`` or ``bookFormat`` is not a column of the file.
    """
    df = pd.read_csv(file_path)
    # Assign the filled column back rather than calling fillna(inplace=True) on
    # a column selection: that chained in-place form is deprecated in pandas and
    # does not modify the frame once copy-on-write is active.
    df['description'] = df['description'].fillna('')  # Ensuring no null values
    df['bookFormat'] = df['bookFormat'].fillna('Unknown')  # Fill missing formats with 'Unknown'
    return df

def add_sentiment_features(df):
    """Attach a VADER sentiment score for every book description.

    A ``sentiment`` column is written onto ``df`` itself (the frame passed in is
    modified), holding the ``compound`` entry of VADER's polarity scores for the
    matching ``description`` value. The same frame is returned.
    """
    vader = SentimentIntensityAnalyzer()

    def compound_score(text):
        # Only the 'compound' entry of the polarity-score dict is kept.
        return vader.polarity_scores(text)['compound']

    df['sentiment'] = df['description'].apply(compound_score)
    return df

def preprocess_text_data(text_series, max_features=100):
    """Vectorize text with TF-IDF over unigrams and bigrams.

    Args:
        text_series: Iterable of documents to vectorize.
        max_features: Vocabulary size cap handed to ``TfidfVectorizer``.

    Returns:
        The matrix produced by fitting the vectorizer on ``text_series`` and
        transforming it. The fitted vectorizer itself is not returned.
    """
    tfidf = TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=max_features,
        stop_words='english',
    )
    tfidf_matrix = tfidf.fit_transform(text_series)
    return tfidf_matrix

def encode_features(df):
    """Build the feature matrix and target vector for the rating regressor.

    Three feature groups are stacked side by side, in this order: TF-IDF of
    ``description``, one-hot encoding of ``bookFormat``, and the median-imputed,
    standardized ``sentiment`` score. ``add_sentiment_features`` is called first,
    so ``df`` gains a ``sentiment`` column as a side effect.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the horizontally stacked feature matrix
        and ``y`` is the array of values from the ``rating`` column.
    """
    enriched = add_sentiment_features(df)

    text_block = preprocess_text_data(enriched['description'])

    format_encoder = OneHotEncoder(handle_unknown='ignore')
    format_block = format_encoder.fit_transform(enriched[['bookFormat']])

    sentiment_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    sentiment_block = Pipeline(sentiment_steps).fit_transform(enriched[['sentiment']])

    features = hstack([text_block, format_block, sentiment_block])
    targets = enriched['rating'].values
    return features, targets

def train_model(X, y):
    """Train a RandomForest regressor and score it on a held-out split.

    The data is split 80/20 with a fixed seed, the forest is fitted on the
    larger part, and error metrics are computed on the smaller part.

    Args:
        X: Feature matrix, one row per sample.
        y: Target values aligned row-for-row with ``X``.

    Returns:
        tuple: ``(model, X_test, y_test, y_pred, mse, rmse, mae)`` - the fitted
        model, the held-out features and targets, the predictions for them, and
        the mean squared, root mean squared and mean absolute errors.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    # RMSE is the square root of the MSE. It was previously computed with
    # mean_absolute_error, which made it an exact duplicate of `mae`.
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test, y_pred)

    return model, X_test, y_test, y_pred, mse, rmse, mae

def train_collaborative_filtering(df, rating_scale=(1, 10), random_state=42):
    """Train an SVD model for collaborative filtering.

    Args:
        df: Frame with ``userID``, ``bookId`` and ``rating`` columns.
        rating_scale: ``(min, max)`` rating bounds handed to Surprise's
            ``Reader``. The default keeps the previously hard-coded value.
            NOTE(review): confirm it matches the actual range of ``rating``
            in the dataset.
        random_state: Seed used for both the train/test split and the SVD
            initialisation, so repeated runs give the same model.

    Returns:
        tuple: ``(svd_model, testset)`` - the fitted SVD model and the held-out
        20% of ratings produced by the split.

    Raises:
        KeyError: If any of the three required columns is missing from ``df``.
    """
    reader = Reader(rating_scale=rating_scale)
    data = Dataset.load_from_df(df[['userID', 'bookId', 'rating']], reader)
    # Seed the split and the factorisation: without a seed every run yields a
    # different test set and different recommendations.
    trainset, testset = surprise_train_test_split(data, test_size=0.2, random_state=random_state)
    svd_model = SVD(n_factors=50, n_epochs=20, random_state=random_state, verbose=True)
    svd_model.fit(trainset)
    return svd_model, testset

def get_recommendations(df, svd_model, user_id, top_n=10):
    """Generate book recommendations for a given user.

    Every book the user has no row for in ``df`` is scored with
    ``svd_model.predict`` and the ``top_n`` highest estimates are returned.

    Args:
        df: Frame with ``userID``, ``bookId`` and ``title`` columns.
        svd_model: Object whose ``predict(user_id, book_id)`` result exposes an
            ``est`` attribute (the estimated rating).
        user_id: Identifier of the user to recommend for.
        top_n: Maximum number of recommendations to return.

    Returns:
        pandas.DataFrame: Columns ``bookId``, ``Estimated Rating`` and ``title``,
        one row per recommended book, ordered from highest to lowest estimate
        and indexed 0..k-1.
    """
    unique_books = df['bookId'].unique()
    user_books = df.loc[df['userID'] == user_id, 'bookId'].unique()
    # Candidate books are those the user has not interacted with yet.
    books_to_predict = np.setdiff1d(unique_books, user_books)

    predictions = [(book, svd_model.predict(user_id, book).est) for book in books_to_predict]
    recommendations = sorted(predictions, key=lambda x: x[1], reverse=True)[:top_n]
    recommended_books = pd.DataFrame(recommendations, columns=['bookId', 'Estimated Rating'])

    # Reduce the title lookup to one row per book BEFORE merging. Merging against
    # the full frame multiplies each recommendation by the number of rows that
    # book has in df, and de-duplicating afterwards leaves gaps in the index.
    titles = df[['bookId', 'title']].drop_duplicates('bookId')
    recommended_books = pd.merge(recommended_books, titles, on='bookId')
    return recommended_books

def main(file_path, user_id):
    """Run the whole pipeline: load, featurize, train both models, report.

    Prints the top recommendations for ``user_id`` first, then the random
    forest's error metrics, and finally hands the loaded frame to
    ``run_kfold_cross_validation``.
    """
    books = load_data(file_path)
    features, targets = encode_features(books)

    # Only the two error metrics from the forest are reported below; the model,
    # held-out data, predictions and MSE are not used in this function.
    _, _, _, _, _, rmse, mae = train_model(features, targets)
    svd_model, _ = train_collaborative_filtering(books)

    # Get book recommendations
    top_books = get_recommendations(books, svd_model, user_id)
    print("Top Recommended Books:")
    print(top_books)

    # Evaluate model
    print("\nEvaluation Metrics:")
    print(f"RMSE (Random Forest): {rmse}")
    print(f"MAE (Random Forest): {mae}")
    # NOTE(review): run_kfold_cross_validation is not defined in the cells
    # visible here - presumably another cell provides it; verify.
    run_kfold_cross_validation(books)

# Entry point: ask for the dataset location and the user to recommend for,
# then run the full pipeline.
if __name__ == "__main__":
    file_path = input("Enter the path to your dataset: ")
    # int() raises ValueError when the entered ID is not a whole number.
    user_id = int(input("Enter User ID: "))
    main(file_path, user_id)


Enter the path to your dataset: /content/Drive/MyDrive/NLP-LLM/books_1.Best_Books_Ever.csv
Enter User ID: 4728
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Top Recommended Books:
                                   bookId  Estimated Rating  \
0                     15852670-ode-to-uke          4.313895   
1                   5206311-rovella-starr          4.298820   
2                   220741.First_and_Only          4.290042   
3                      465904.Dragonsdawn          4.287323   
4  31700489-el-laberinto-de-los-esp-ritus          4.284133   
5               2823.The_Birth_of_Tragedy          4.280306   
6         4506536-twenty-years-in-s