In [7]:
!pip install pandas nltk scikit-learn
!python -m nltk.downloader punkt
!python -m nltk.downloader stopwords




[nltk_data] Downloading package punkt to C:\Users\Devanand
[nltk_data]     J\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Devanand
[nltk_data]     J\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
import ast
import os
from pathlib import Path

import nltk
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder

# Load the datasets. The directory defaults to the original local location but
# can be redirected with the ABSA_DATA_DIR environment variable, so the
# notebook also runs on machines where the CSV files live somewhere else.
DATA_DIR = Path(os.environ.get('ABSA_DATA_DIR', 'C:/Users/Devanand J/Desktop/Final'))
train_data = pd.read_csv(DATA_DIR / 'SemEval.csv')
test_data = pd.read_csv(DATA_DIR / 'Yelp_dataset.csv')

# Processing training data to extract labels and encoding them
def extract_labels(aspect_terms_str):
    """Return the polarity of the first aspect term annotated on a review.

    Parameters
    ----------
    aspect_terms_str : str
        Text form of a Python literal, expected to be a list of dicts that
        each carry a 'polarity' key. Missing cells (None / NaN) and blank
        strings are tolerated.

    Returns
    -------
    The 'polarity' value of the first aspect term, or None when the cell is
    missing, blank, or holds an empty collection.

    Raises
    ------
    ValueError, SyntaxError
        If a non-blank string is not a valid Python literal.
    """
    # pandas represents empty CSV cells as float NaN, which ast.literal_eval
    # rejects with ValueError; treat any non-string or blank cell as unlabeled.
    if not isinstance(aspect_terms_str, str) or not aspect_terms_str.strip():
        return None

    aspect_terms = ast.literal_eval(aspect_terms_str)
    if aspect_terms:
        # Only the first annotated term is used as the review-level label.
        return aspect_terms[0]['polarity']
    return None

# Derive one polarity label per review from its annotated aspect terms.
train_data['polarity'] = train_data['aspectTerms'].apply(extract_labels)

# Reviews without a label cannot be used for training. The explicit .copy()
# makes the filtered frame independent of the original, so adding a column
# below does not raise SettingWithCopyWarning.
train_data = train_data.dropna(subset=['polarity']).copy()

# Map the string labels to integer class ids; the fitted encoder is kept so
# predictions can be translated back to label names later.
label_encoder = LabelEncoder()
train_data['polarity_encoded'] = label_encoder.fit_transform(train_data['polarity'])



In [9]:
# Cache for the English stop-word set; filled on first use so the corpus file
# is read once instead of once per review.
_ENGLISH_STOP_WORDS = None


def extract_aspect_terms(text, stop_words=None):
    """Return the candidate aspect terms of a review.

    This is a simple token filter, not noun-phrase extraction: the text is
    tokenized with NLTK's word_tokenize and every token that is purely
    alphanumeric and not a stop word (compared case-insensitively) is kept,
    in its original casing and order.

    Parameters
    ----------
    text : str
        Review text. Non-string values (e.g. NaN from an empty CSV cell)
        yield an empty list instead of raising.
    stop_words : collection of str, optional
        Lower-case words to discard. Defaults to NLTK's English stop-word
        list, which is loaded once and cached.

    Returns
    -------
    list of str
        The retained tokens.
    """
    global _ENGLISH_STOP_WORDS

    # word_tokenize only accepts strings; missing values have no terms.
    if not isinstance(text, str):
        return []

    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS

    return [
        word
        for word in word_tokenize(text)
        if word.lower() not in stop_words and word.isalnum()
    ]

# Derive the candidate aspect terms of every training review.
train_data['aspect_terms'] = train_data['comments'].map(extract_aspect_terms)


In [10]:
# Expand every review into one (token, label) pair per candidate aspect term;
# each token inherits the encoded polarity of the review it came from.
token_label_pairs = [
    (token, label)
    for tokens, label in zip(train_data['aspect_terms'], train_data['polarity_encoded'])
    for token in tokens
]

aspect_term_list = [token for token, _ in token_label_pairs]
polarity_list = [label for _, label in token_label_pairs]

# One row per token: the training table for the term-level classifier.
aspect_df = pd.DataFrame({'aspect_terms': aspect_term_list, 'polarity': polarity_list})


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

# Each row of aspect_df is a single token with its encoded polarity.
train_tokens, train_labels = aspect_df['aspect_terms'], aspect_df['polarity']

# Learn the TF-IDF vocabulary from the tokens and encode them in one pass.
vectorizer = TfidfVectorizer()
X_aspect = vectorizer.fit_transform(train_tokens)
y_aspect = train_labels

# Fit a support vector classifier (library defaults) on the encoded tokens.
model = SVC()
model.fit(X_aspect, y_aspect)


In [5]:
# Derive the candidate aspect terms of every test review.
test_data['aspect_terms'] = test_data['preprocessed_comments'].apply(extract_aspect_terms)

# Flatten the per-review term lists into one list, keeping review order.
test_aspect_term_list = [
    term
    for terms in test_data['aspect_terms']
    for term in terms
]

# One row per term, ready for vectorization.
test_aspect_df = pd.DataFrame({'aspect_terms': test_aspect_term_list})

# Encode with the vocabulary learned on the training tokens, predict the
# class id of every term, then map the ids back to polarity names.
X_test_aspect = vectorizer.transform(test_aspect_df['aspect_terms'])
y_test_aspect_pred = model.predict(X_test_aspect)
test_aspect_df['predicted_polarity'] = label_encoder.inverse_transform(y_test_aspect_pred)

# Pair a review's aspect terms with their predicted polarities
def aggregate_predictions(text, aspect_terms, predicted_polarity):
    """Pair aspect terms with polarities by position.

    `text` is accepted but not used. Pairing stops at the end of the shorter
    of the two sequences.

    Returns a list of (aspect_term, polarity) tuples.
    """
    return [
        (term, polarity)
        for term, polarity in zip(aspect_terms, predicted_polarity)
    ]

# The flat prediction list was built review by review, so each review owns a
# contiguous slice of it. Passing the whole list to every row (as before)
# paired each review's terms with the FIRST predictions rather than its own.
flat_predictions = test_aspect_df['predicted_polarity'].tolist()  # built once, not per row
term_counts = test_data['aspect_terms'].map(len)
slice_ends = term_counts.cumsum()
slice_starts = slice_ends - term_counts

test_data['aspect_polarity'] = pd.Series(
    [
        aggregate_predictions(text, terms, flat_predictions[start:end])
        for text, terms, start, end in zip(
            test_data['preprocessed_comments'],
            test_data['aspect_terms'],
            slice_starts,
            slice_ends,
        )
    ],
    index=test_data.index,
    dtype=object,
)


In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

# Evaluate on a held-out 20% validation split of the token-level training set.
# NOTE(review): X_aspect was vectorized before splitting, so the TF-IDF
# vocabulary/IDF weights have seen the validation tokens — confirm this is
# acceptable for the reported numbers.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_aspect, y_aspect, test_size=0.2, random_state=42
)

# Fit a separate estimator for evaluation. Refitting `model` here would
# silently replace the classifier trained on the full training set, which is
# the one used to predict on the test data.
eval_model = SVC().fit(X_train_split, y_train_split)
y_val_pred = eval_model.predict(X_val_split)

# Calculating evaluation metrics (class-frequency weighted averages)
accuracy = accuracy_score(y_val_split, y_val_pred)
precision = precision_score(y_val_split, y_val_pred, average='weighted')
recall = recall_score(y_val_split, y_val_pred, average='weighted')
f1 = f1_score(y_val_split, y_val_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


Accuracy: 0.5561
Precision: 0.4698
Recall: 0.5561
F1-Score: 0.4822
