<a href="https://colab.research.google.com/github/Talha-coder-01/Natural_Language_Processing/blob/main/Movie_Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import csv

def load_problematic_csv(file_path, encoding='utf-8'):
    """Load a review/sentiment CSV whose rows have stray, unquoted commas.

    Each non-empty row is parsed with the ``csv`` module (so quoted commas
    and quoted line breaks are respected) and reduced to exactly two values.
    The column layout is taken from the header row when one is present:

    * ``review,sentiment`` -- the last field is the sentiment and every
      field before it is re-joined with ``,`` to rebuild the review.
    * ``sentiment,review`` or no recognised header -- the first field is
      the sentiment and the remaining fields are re-joined into the review.

    A recognised header row is not returned as data. Rows with fewer than
    two fields are reported on stdout and skipped.

    Args:
        file_path: Path of the CSV file to read.
        encoding: Text encoding used to open the file.

    Returns:
        A DataFrame with the columns ``sentiment`` and ``review``.
    """
    rows = []
    sentiment_first = True  # layout assumed when the file carries no header
    header_pending = True

    # newline='' leaves line-ending handling to the csv module, which is what
    # keeps line breaks inside quoted reviews intact.
    with open(file_path, 'r', encoding=encoding, newline='') as f:
        csv_reader = csv.reader(f, quotechar='"', delimiter=',', skipinitialspace=True)
        for row in csv_reader:
            if not row:  # blank line
                continue

            if header_pending:
                header_pending = False
                header = [cell.strip().lower() for cell in row]
                if header == ['review', 'sentiment']:
                    sentiment_first = False
                    continue
                if header == ['sentiment', 'review']:
                    continue

            if len(row) < 2:
                print(f"Skipping malformed row: {row}")
                continue

            # Extra fields come from unquoted commas inside the review text,
            # so they are glued back together on the review side.
            if sentiment_first:
                sentiment, review_parts = row[0], row[1:]
            else:
                sentiment, review_parts = row[-1], row[:-1]
            rows.append([sentiment, ','.join(review_parts)])

    return pd.DataFrame(rows, columns=['sentiment', 'review'])

# Load the dataset. The fast pandas parser is tried first; if it rejects the
# file, the same file is re-read with the tolerant row-by-row loader.
DATASET_PATH = '/content/IMDB Dataset.csv'
try:
    df = pd.read_csv(DATASET_PATH)
except pd.errors.ParserError:
    print("Error encountered with standard pd.read_csv(). Attempting robust loading method.")
    df = load_problematic_csv(DATASET_PATH)

# Print the first few rows and dataset info to verify successful loading.
print(df.head())
# DataFrame.info() writes its report itself and returns None, so it is called
# directly rather than wrapped in print() (which would emit a stray "None").
df.info()

# Data preparation: replace missing reviews with empty strings and lowercase
# the text so the vectorizer sees a uniform column of strings.
df['review'] = df['review'].fillna('').str.lower()


# Sentiment analysis: hold out 20% of the reviews for evaluation
# (fixed seed so the split is reproducible).
X, y = df['review'], df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the TF-IDF vocabulary (capped at 5000 terms) from the training split
# only, then project the test split into that same feature space.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Fit a multinomial Naive Bayes classifier and predict the held-out reviews.
nb_classifier = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred = nb_classifier.predict(X_test_tfidf)

# Report accuracy and the per-class precision/recall/F1 table.
print("Sentiment Analysis Model Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Build one TF-IDF row per review across the whole dataset.
# NOTE(review): fit_transform re-fits `tfidf` here, replacing the vocabulary
# learned from X_train above. After this line `tfidf.transform(...)` no longer
# produces the features `nb_classifier` was trained on -- confirm nothing
# later feeds `tfidf` output to the classifier.
tfidf_matrix = tfidf.fit_transform(df['review'])

# Pairwise cosine similarity of every review against every other review
# (with a single argument the matrix is compared against itself).
# NOTE(review): the result is a dense n_reviews x n_reviews array; for the
# 50,000-row dataset shown in the notebook output that is roughly 20 GB as
# float64 -- confirm the runtime has enough memory.
cosine_sim = cosine_similarity(tfidf_matrix)

# Function to get movie recommendations
def get_recommendations(review_index, cosine_sim, df, top_n=10):
    """Return the reviews most similar to the review at ``review_index``.

    The query review is excluded by position rather than by assuming it
    sorts first: a duplicate review at a lower index (similarity 1.0 as
    well) or an all-zero row would otherwise let the query recommend itself.

    Args:
        review_index: Positional row of the query review in ``cosine_sim``
            and ``df``. Negative values count from the end.
        cosine_sim: Pairwise similarity matrix; row ``i`` holds the
            similarity of review ``i`` to every review, in ``df`` row order.
        df: DataFrame with ``review`` and ``sentiment`` columns.
        top_n: Maximum number of recommendations to return.

    Returns:
        A DataFrame with the columns ``review``, ``sentiment`` and
        ``similarity_score``, ordered from most to least similar (ties keep
        ascending row order) and indexed by the matching ``df`` labels.
    """
    scores = np.asarray(cosine_sim[review_index]).ravel()
    query_pos = review_index if review_index >= 0 else review_index + len(scores)

    # Stable sort on the negated scores: descending similarity, with equal
    # scores left in ascending row order.
    ranked = np.argsort(-scores, kind='stable')
    top = ranked[ranked != query_pos][:max(top_n, 0)]

    return pd.DataFrame({
        'review': df['review'].iloc[top],
        'sentiment': df['sentiment'].iloc[top],
        'similarity_score': scores[top],
    })

# Example usage: look up the reviews most similar to the one at `review_index`
# and print them together with their sentiment labels and similarity scores.
review_index = 0  # You can change this to any index in your dataset
recommendations = get_recommendations(review_index, cosine_sim, df)
print(f"\nRecommendations based on review at index {review_index}:")
print(recommendations)

# Function to get overall sentiment of recommendations
def get_recommendation_sentiment(recommendations):
    """Return the most frequent sentiment label among the recommendations.

    Args:
        recommendations: DataFrame with a ``sentiment`` column.

    Returns:
        The most common value of the ``sentiment`` column. When several
        labels tie, the first one in sorted order is returned. ``None`` is
        returned when the column has no non-missing values, instead of
        failing with an IndexError on the empty mode result.
    """
    modes = recommendations['sentiment'].mode()
    if modes.empty:
        return None
    return modes.iloc[0]

# Summarise the recommendations computed above by their most frequent sentiment label.
overall_sentiment = get_recommendation_sentiment(recommendations)
print(f"\nOverall sentiment of recommendations: {overall_sentiment}")

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
None
Sentiment Analysis Model Performance:
Accuracy: 0.8517

Classification Report:
              precision    recall  f1-score   support

    negative       0.85      0.85      0.85      4961
    positive       0.85      0.85      0.85      5039

    accuracy                           0.85     10000
   macro avg  