<a href="https://colab.research.google.com/github/SKAZEXE/Zaidi/blob/main/Imdb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
df = pd.read_csv('//content/train_data.txt', sep=' ::: ', engine='python', names=['Title', 'Genre', 'Description'], nrows=6000)

df.head()

Unnamed: 0,Title,Genre,Description
1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...


In [12]:
#Preprocessing
df['Description'] = df['Description'].str.lower()
df['Title'] = df['Title'].str.lower()
df['Genre'] = df['Genre'].str.lower()
df.head()

Unnamed: 0,Title,Genre,Description
1,oscar et la dame rose (2009),drama,listening in to a conversation between his doc...
2,cupid (1997),thriller,a brother and sister with a past incestuous re...
3,"young, wild and wonderful (1980)",adult,as the bus empties the students for their fiel...
4,the secret sin (1915),drama,to help their unemployed father make ends meet...
5,the unrecovered (2007),drama,the film's title refers not only to the un-rec...


In [13]:
#COunt
df['Genre'].value_counts()


drama          1481
documentary    1459
comedy          830
short           559
horror          235
thriller        211
action          146
western         116
reality-tv      100
family           82
adventure        82
sci-fi           79
music            75
adult            73
romance          58
animation        54
sport            51
crime            48
talk-show        41
mystery          37
biography        36
fantasy          35
musical          29
history          27
game-show        25
news             18
war              13
Name: Genre, dtype: int64

In [14]:
df = df[
    (df['Genre'] == 'drama') | (df['Genre'] == 'music') | (df['Genre'] == 'documentary') | (df['Genre'] == 'western')]
df

Unnamed: 0,Title,Genre,Description
1,oscar et la dame rose (2009),drama,listening in to a conversation between his doc...
4,the secret sin (1915),drama,to help their unemployed father make ends meet...
5,the unrecovered (2007),drama,the film's title refers not only to the un-rec...
6,quality control (2011),documentary,quality control consists of a series of 16mm s...
11,the spirit world: ghana (2016),documentary,tom beacham explores ghana with director of ph...
...,...,...,...
5990,into abyssinia (2009),documentary,"carol foster, a mother of seven adopted childr..."
5993,con el tango en el corazón (tres minutos) (2010),documentary,a narrator introduces us to the enchanted worl...
5997,naufrage dans l'ungava (2016),documentary,"ť after the euphoria of being, at last, in the..."
5998,nascar: unauthorized (2006),documentary,nascar: unauthorized takes you out of the blea...


In [15]:
vec = TfidfVectorizer(stop_words='english')

In [16]:
matrix = vec.fit_transform(df['Description'])

In [17]:
X = matrix.toarray()

In [18]:
vec.get_feature_names_out()


array(['00', '000', '000km', ..., 'żestán', 'żo', 'żte'], dtype=object)

In [19]:
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)


In [20]:
fig = px.scatter(x=X_pca[:, 0], y=X_pca[:, 1], color=df['Genre'], hover_name=df['Title'], template='plotly_dark')
fig.update_layout(
    title="2 Component PCA visualization of Movie Genres",
    xaxis_title="1st Principal Component",
    yaxis_title="2nd Principal Component",
)
fig.show()

In [21]:
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)
fig = px.scatter_3d(x=X_pca[:, 0], y=X_pca[:, 1], z=X_pca[:, 2], color=df['Genre'], opacity=0.8,
                    title="3 Component PCA visualization of Movie Genres", hover_name=df['Title'], template='plotly_dark')
fig.show()

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
import numpy as np

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, df['Genre'], test_size=0.2, random_state=42)

# Train a Logistic Regression classifier
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Get unique classes from the true labels
unique_classes = np.unique(y_test)

# Initialize a classification report with zero_division=1 for each unique class
class_report = classification_report(y_test, y_pred, labels=unique_classes, zero_division=1)

# Print the classification report
print(class_report)


              precision    recall  f1-score   support

 documentary       0.85      0.90      0.87       304
       drama       0.81      0.86      0.84       285
       music       1.00      0.00      0.00        12
     western       1.00      0.04      0.07        26

    accuracy                           0.83       627
   macro avg       0.92      0.45      0.45       627
weighted avg       0.84      0.83      0.81       627



In [23]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate cosine similarity between descriptions
similarity_matrix = cosine_similarity(X)

# Change this to the index of the movie/TV show you want to find similar content for
movie_index = 1  # Set it to an index that exists in your filtered dataset

# Get the top 4 most similar items
similar_indices = similarity_matrix[movie_index].argsort()[:-5:-1]

# Retrieve the titles and similarity scores for the most similar items
similar_movies = [(df['Title'].iloc[i], similarity_matrix[movie_index][i]) for i in similar_indices]
print(similar_movies)


[('the secret sin (1915)', 1.0), ('"the widow and the viper" (2016)', 0.22777958941134968), ('sie ist meine mutter (2006)', 0.2253356150354337), ('north of arizona (1935)', 0.176325253953484)]


In [24]:
# Extract keywords from descriptions using TF-IDF
tfidf_scores = vec.idf_  # Get the inverse document frequency (IDF) scores for each term
feature_names = vec.get_feature_names_out()

# Combine TF-IDF scores and feature names into a list of tuples
tfidf_tuples = list(zip(feature_names, tfidf_scores))

# Sort the list based on IDF scores in descending order to get top keywords
tfidf_tuples.sort(key=lambda x: x[1], reverse=True)

# Get the top 10 keywords
top_keywords = tfidf_tuples[:10]
print(top_keywords)


[('000km', 8.356279876550747), ('02', 8.356279876550747), ('03', 8.356279876550747), ('04', 8.356279876550747), ('070', 8.356279876550747), ('09', 8.356279876550747), ('104', 8.356279876550747), ('1080p', 8.356279876550747), ('10am', 8.356279876550747), ('10ms', 8.356279876550747)]


In [25]:
# Tag content based on genre predictions
df['Predicted_Genre'] = classifier.predict(X)

# Tag content based on keywords
df['Top_Keywords'] = df['Description'].apply(lambda desc: [word for word, _ in top_keywords if word in desc])

# Example: Display a sample of tagged content
print(df[['Title', 'Genre', 'Predicted_Genre', 'Top_Keywords']].head())


                             Title        Genre Predicted_Genre Top_Keywords
1     oscar et la dame rose (2009)        drama           drama           []
4            the secret sin (1915)        drama           drama           []
5           the unrecovered (2007)        drama     documentary           []
6           quality control (2011)  documentary     documentary           []
11  the spirit world: ghana (2016)  documentary     documentary           []


In [26]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import gensim


# Preprocess the descriptions (if not already done)
df['Description'] = df['Description'].str.lower()

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the descriptions into TF-IDF vectors
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Description'])

# Create an LDA model
lda_model = LatentDirichletAllocation(n_components=10, random_state=42)  # Adjust the number of topics as needed

# Fit the LDA model on the TF-IDF vectors
lda_model.fit(tfidf_matrix)

# Get the top words for each topic
top_words_per_topic = []
for topic in lda_model.components_:
    top_words = [tfidf_vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-10:]]
    top_words_per_topic.append(top_words)

# Print the top words for each topic
for i, top_words in enumerate(top_words_per_topic):
    print(f"Topic {i + 1}: {', '.join(top_words)}")

# Assign topics to each description
topic_assignments = lda_model.transform(tfidf_matrix)

# Add topic assignments to the DataFrame
df['Topic'] = topic_assignments.argmax(axis=1)

# Display a sample of the DataFrame with topic assignments
print(df[['Title', 'Description', 'Topic']])


Topic 1: husband, relationships, adam, emily, billy, jack, pierre, guilt, wife, mia
Topic 2: red, music, zealand, adele, realizes, nina, ballet, bud, contest, rahul
Topic 3: monique, canyon, palestinian, underground, edith, lorenzo, border, soldiers, circus, miami
Topic 4: title, iraq, door, tale, idea, forests, pop, proud, schools, evelyn
Topic 5: anand, zealand, crosby, hayes, japanese, steps, stars, josh, la, et
Topic 6: ted, imprisoned, coal, sit, spanish, pat, cuba, tale, mexico, feels
Topic 7: sara, nathan, doctors, sonora, kim, trial, holds, family, throne, ralph
Topic 8: people, documentary, years, family, young, world, new, story, film, life
Topic 9: tim, alaska, roof, critical, joe, shared, graffiti, wildlife, salmon, hit
Topic 10: charm, man, indigenous, tom, desires, wanted, tale, material, morrison, adam
                                                 Title  \
1                         oscar et la dame rose (2009)   
4                                the secret sin (1915) 

In [27]:
# Print out the list of movie titles from your dataset
print(df['Title'])



1                           oscar et la dame rose (2009)
4                                  the secret sin (1915)
5                                 the unrecovered (2007)
6                                 quality control (2011)
11                        the spirit world: ghana (2016)
                              ...                       
5990                               into abyssinia (2009)
5993    con el tango en el corazón (tres minutos) (2010)
5997                       naufrage dans l'ungava (2016)
5998                         nascar: unauthorized (2006)
6000                              "summer rental" (2011)
Name: Title, Length: 3131, dtype: object


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the descriptions into TF-IDF vectors
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Description'])

# Compute the cosine similarity between items based on their descriptions
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Create a mapping of titles to their indices
title_to_index = pd.Series(df.index, index=df['Title'])

# Function to get movie predictions based on user input
def get_predictions(movie_title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = title_to_index[movie_title]

    # Get the pairwise similarity scores for all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the top 10 most similar movies (excluding the input movie)
    sim_scores = sim_scores[1:11]

    # Get the movie indices and titles
    movie_indices = [i[0] for i in sim_scores]
    similar_movies = df['Title'].iloc[movie_indices]

    # Return the titles of the top 10 similar movies
    return similar_movies

# Example: Get predictions based on the movie "the unrecovered"
movie_title = "the unrecovered"  # Replace with the actual movie title
predictions = get_predictions(movie_title)

# Print the predicted movies with titles
for i, movie in enumerate(predictions, 1):
    print(f"{i}. {movie}")


KeyError: ignored

In [None]:
#Count the number of movies which are starting from the letter A
A = df[df['Title'].str.startswith('a')]
count = A['Title'].count()
print(count)
A

In [None]:
import numpy as np
from google.colab import autoviz

def categorical_histogram(df, colname, figscale=1, mpl_palette_name='Dark2'):
  from matplotlib import pyplot as plt
  import seaborn as sns
  df.groupby(colname).size().plot(kind='barh', color=sns.palettes.mpl_palette(mpl_palette_name), figsize=(8*figscale, 4.8*figscale))
  plt.gca().spines[['top', 'right',]].set_visible(False)
  return autoviz.MplChart.from_current_mpl_state()

chart = categorical_histogram(A, *['Genre'], **{})
chart

In [None]:
from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder instance
label_encoder = LabelEncoder()

# Encode the 'Genre' column
dff['Genre_encoded'] = label_encoder.fit_transform(dff['Genre'])

# Display the updated DataFrame
dff.Genre_encoded

# New section

In [None]:
# Drop rows with missing 'Description' values
dff.dropna(subset=['Description'], inplace=True)

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust the number of features as needed

# Fit and transform the plot summary text
tfidf_matrix = tfidf_vectorizer.fit_transform(dff['Description'])

# Convert the TF-IDF matrix to a DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Concatenate the TF-IDF features with your DataFrame
dff = pd.concat([dff, tfidf_df], axis=1)

In [None]:
# Check for missing values in the 'Description' column
missing_description = dff['Description'].isna()

# Fill missing descriptions with empty strings or any other appropriate placeholder
dff['Description'].fillna('', inplace=True)

# Define a list of keywords to search for
keywords = ["action", "comedy", "romance"]

# Create binary columns for each keyword
for keyword in keywords:
    dff[keyword + "_keyword"] = dff['Description'].str.contains(keyword).astype(int)

# Reset NaN values in 'Description' back to NaN if needed
import numpy as np
dff.loc[missing_description, 'Description'] = np.nan
dff.Description

In [None]:
from textblob import TextBlob

# Function to calculate sentiment polarity
def get_sentiment(text):
    # Check for NaN values and return neutral sentiment if NaN
    if isinstance(text, str):
        analysis = TextBlob(text)
        return analysis.sentiment.polarity
    else:
        return 0  # You can choose to return a different default value for NaN

# Apply the sentiment analysis function to the plot summary and create a new column
dff['Sentiment'] = dff['Description'].apply(get_sentiment)


In [None]:
# Calculate the number of words in the plot summary, handling NaN values
dff['Plot_Summary_Length'] = dff['Description'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)

# Alternatively, you can calculate the number of characters, handling NaN values
dff['Plot_Summary_Length'] = dff['Description'].apply(lambda x: len(x) if isinstance(x, str) else 0)


In [None]:
# Calculate the total word count for each plot summary, handling NaN values
dff['Total_Words'] = dff['Description'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)

# Calculate the average word length, handling empty descriptions and NaN values
dff['Average_Word_Length'] = dff['Description'].apply(
    lambda x: np.mean([len(word) for word in x.split()]) if isinstance(x, str) and len(x.split()) > 0 else 0
)

# Access the 'Average_Word_Length' column
average_word_lengths = dff['Average_Word_Length']
average_word_lengths

In [None]:
# Remove rows with NaN values in the 'Description' column
dff.dropna(subset=['Description'], inplace=True)

# Create a CountVectorizer for bigrams (N=2)
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
bigram_matrix = bigram_vectorizer.fit_transform(dff['Description'])

# Create a DataFrame from the bigram matrix
bigram_df = pd.DataFrame(bigram_matrix.toarray(), columns=bigram_vectorizer.get_feature_names_out())

# Concatenate the bigram features with your DataFrame
dff = pd.concat([dff, bigram_df], axis=1)
