<a href="https://colab.research.google.com/github/Nikil263/Movie-Review-Tweets-Dataset-for-Spoiler-Detection/blob/main/baseline1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Baseline 1

In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

# Load data
def load_data(file_path):
    data = pd.read_csv(file_path)
    return data

# Define pre-processing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if not token in stop_words]
    # Lemmatize text
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Join tokens back into a string
    text = ' '.join(tokens)
    return text

# Define feature extraction functions
def extract_named_entities(text):
    named_entities = []
    for sent in nltk.sent_tokenize(text):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            if hasattr(chunk, 'label'):
                named_entities.append(' '.join(c[0] for c in chunk))
    return named_entities

def extract_frequent_verbs(text):
    # Tokenize text
    tokens = word_tokenize(text)
    # Get parts-of-speech tags
    pos_tags = nltk.pos_tag(tokens)
    # Filter for verbs only
    verbs = [token for token, pos in pos_tags if pos.startswith('V')]
    # Get the most frequent verbs
    freq_dist = nltk.FreqDist(verbs)
    most_common_verbs = [verb for verb, freq in freq_dist.most_common(10)]
    return most_common_verbs

def has_url(text):
    # Check if text contains a URL
    url_pattern = r'https?://\S+'
    if re.search(url_pattern, text):
        return True
    else:
        return False

# Load data and extract features
data = load_data('/content/labeldataset mod-2.csv')
data['processed_text'] = data['TWEET'].apply(preprocess_text)
data['named_entities'] = data['processed_text'].apply(extract_named_entities)
data['frequent_verbs'] = data['processed_text'].apply(extract_frequent_verbs)
data['has_url'] = data['processed_text'].apply(has_url)

# Print the first five rows of the data
print(data.head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


   LABEL                                              TWEET  \
0      0       Ayooooooo we been lit lately Nobu tomorrow 🤪   
1      0   Perma negativity being fomented on the airwav...   
2      1  Divergent literally destroyed the teen dystopi...   
3      0  @Malie_N We're here for a good time not a long...   
4      1  My favorite @DivergentClub_!\nI wish I can min...   

                                      processed_text named_entities  \
0               ayooooooo lit lately nobu tomorrow 🤪             []   
1  perma negativity fomented airwave — divergent ...             []   
2  divergent literally destroyed teen dystopia genre             []   
3                @ malie_n 're good time long time 🤝             []   
4            favorite @ divergentclub_ ! wish mint 😺             []   

                  frequent_verbs  has_url  
0                             []    False  
1  [fomented, airwave, shown, —]    False  
2                    [destroyed]    False  
3                 

In [2]:
data['has_url']=data['has_url'].astype(str)

In [3]:
data['processed_text'].fillna('unknown', inplace=True)
data['named_entities'].fillna('unknown', inplace=True)
data['named_entities'].fillna('unknown', inplace=True)
data['has_url'].fillna('false', inplace=True)
data['LABEL'].fillna(0, inplace=True)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC

# Convert named entities and frequent verbs to one-hot encoding
named_entities = pd.get_dummies(data['named_entities'].apply(pd.Series).stack()).sum(level=0)
frequent_verbs = pd.get_dummies(data['frequent_verbs'].apply(pd.Series).stack()).sum(level=0)
has_url = pd.get_dummies(data['has_url'])

# Combine processed text, named entities, and frequent verbs into a single dataframe
features = pd.concat([data['processed_text'], named_entities, frequent_verbs, has_url], axis=1)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(features, data['LABEL'], test_size=0.2, random_state=42, stratify=data['LABEL'])

# Convert text features to feature vectors using CountVectorizer
vectorizer = CountVectorizer()
X_train_text = vectorizer.fit_transform(X_train['processed_text'])
X_test_text = vectorizer.transform(X_test['processed_text'])

# Get list of all features
feature_names = vectorizer.get_feature_names_out()

# Combine text features and one-hot encoded features into a single feature matrix
X_train_features = pd.concat([pd.DataFrame(X_train_text.toarray(), columns=feature_names), X_train.iloc[:, 1:]], axis=1)
X_test_features = pd.concat([pd.DataFrame(X_test_text.toarray(), columns=feature_names), X_test.iloc[:, 1:]], axis=1)




  named_entities = pd.get_dummies(data['named_entities'].apply(pd.Series).stack()).sum(level=0)
  named_entities = pd.get_dummies(data['named_entities'].apply(pd.Series).stack()).sum(level=0)
  frequent_verbs = pd.get_dummies(data['frequent_verbs'].apply(pd.Series).stack()).sum(level=0)
  frequent_verbs = pd.get_dummies(data['frequent_verbs'].apply(pd.Series).stack()).sum(level=0)


In [18]:
from sklearn.impute import SimpleImputer

# Fill in missing values with the mean of the respective feature column
imputer = SimpleImputer(strategy='mean')
X_train_features = imputer.fit_transform(X_train_features)
X_test_features = imputer.transform(X_test_features)




In [23]:
X_train_features_df = pd.DataFrame(X_train_features)

X_train_features_df = X_train_features_df.iloc[:]

# Convert back to numpy array
X_train_features = X_train_features_df.to_numpy()

y_train = y_train[:2913]


In [24]:
# Train SVM model
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_features, y_train)




In [27]:
# Evaluate model on test data
y_pred = svm_model.predict(X_test_features)


In [29]:
y_pred = y_pred[:-648]
y_pred.size

800

In [30]:
accuracy = (y_pred == y_test).mean()
print('Accuracy:', accuracy)


Accuracy: 0.925
