# 3.b Synthetic Data Generation

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import nltk
from nltk import NaiveBayesClassifier, FreqDist
from nltk.tokenize import word_tokenize

In [3]:
df = pd.read_csv("IMDB Dataset.csv")

In [4]:
df.head

<bound method NDFrame.head of                                                   review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]>

## Descriptive statistics of the review

In [5]:
def no_of_words(text):
    words=text.split()
    word_count=len(words)
    return word_count

In [6]:
wc=df['review'].apply(no_of_words)

In [7]:
wc.describe()

count    50000.000000
mean       231.156940
std        171.343997
min          4.000000
25%        126.000000
50%        173.000000
75%        280.000000
max       2470.000000
Name: review, dtype: float64

## Synthetic Data Generating Model

In [11]:
# Function to extract n-gram features from a document
def document_features(document, ngram_features):
    document_ngrams = list(nltk.ngrams(document, 2))
    features = {}
    for ngram in ngram_features:
        features['contains({})'.format(ngram)] = (ngram in document_ngrams)
    return features

# Train a Naive Bayes classifier with n-gram features
def train_naive_bayes_classifier(df, custom_vocab, n):
    # Shuffle the DataFrame
    df = df.sample(frac=1).reset_index(drop=True)

    # Select a subset of the DataFrame for demonstration purposes
    df_subset = df.head(3000)

    # Tokenize the reviews into words
    all_words = [word.lower() for review in df_subset['review'] for word in word_tokenize(review)]

    # Filter the words using the custom vocabulary
    filtered_words = [word for word in all_words if word in custom_vocab]

    # Create n-grams from the filtered words
    ngrams = list(nltk.ngrams(filtered_words, n))

    # Select the most frequent n-grams as features
    ngram_features = [ngram for ngram, _ in FreqDist(ngrams).most_common(3000)]

    # Create feature sets for training
    featuresets = [(document_features(word_tokenize(review), ngram_features), sentiment)
                   for review, sentiment in zip(df_subset['review'], df_subset['sentiment'])]

    # Split the dataset into training and testing sets
    train_set, test_set = featuresets[:1500], featuresets[1500:]

    # Train the Naive Bayes classifier
    classifier = NaiveBayesClassifier.train(train_set)

    # Generate 10K movie reviews using the trained classifier
    generated_reviews = []
    sentiments = []
    for _ in range(10000):
        generated_review, sentiment = generate_movie_review(classifier, ngram_features)
        generated_reviews.append(generated_review)
        sentiments.append(sentiment)

    # Create a DataFrame for the generated reviews and sentiments
    result_df = pd.DataFrame({'Review': generated_reviews, 'Sentiment': sentiments})

    # Save the DataFrame to a CSV file
    result_df.to_csv("synthetic_reviews.csv", index=False)
    
    return result_df, classifier, ngram_features

# Function to generate a movie review using the trained classifier
def generate_movie_review(classifier, ngram_features, length=50):
    words = []
    for _ in range(length):
        # Use n-grams to generate the next word
        ngram = random.choice(ngram_features)
        words.extend(ngram)

    # Classify the document using the Naive Bayes classifier
    document = document_features(words, ngram_features)
    sentiment = classifier.classify(document)

    # Return the generated review and sentiment
    generated_review = " ".join(words)
    return generated_review, sentiment


In [12]:
def main():
    # Load the IMDb dataset from the CSV file
    df = pd.read_csv("IMDB Dataset.csv")

    # Read the custom vocabulary from the file
    with open("imdb.vocab", 'r', encoding='utf-8') as f:
        custom_vocab = set(f.read().splitlines())

    n = 2

    # Generate a DataFrame for the generated reviews and sentiments
    result_df, _, _ = train_naive_bayes_classifier(df, custom_vocab, n)

    print(result_df)

if __name__ == "__main__":
    main()

                                                 Review Sentiment
0     about his minutes and is just which the was it...  negative
1     he can while it what she plenty of much to dea...  positive
2     and get we have on their and again maybe it kn...  positive
3     has no we do the idea the point to anyone it w...  negative
4     best friend could not the dialogue why do amon...  negative
...                                                 ...       ...
9995  funny and the island the screen of mine happen...  positive
9996  probably the is not is he dull and the women t...  negative
9997  the years movie br but all it still know how i...  negative
9998  movie ! his character him to for all it good w...  negative
9999  what was with some supposed to so i the air wo...  positive

[10000 rows x 2 columns]
