In [2]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.11.6-cp310-cp310-win_amd64.whl.metadata (41 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------------- -------------------------- 0.5/1.5 MB 4.2 MB/s eta 0:00:01
   -------------------- ------------------- 0.8/1.5 MB 2.2 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 2.7 MB/s eta 0:00:00
Downloading regex-2024.11.6-cp310-cp310-win_amd64.whl (274 kB)
Installing collected packages: regex, nltk
Successfully installed nltk-3.9.1 regex-2024.11.6




[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
# Step 1: Import required libraries
import pandas as pd
import numpy as np
import re
import string
import nltk
import warnings
# Silences every Python warning for the rest of the session (this includes
# library warnings such as convergence or deprecation notices).
# NOTE(review): consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Download NLTK stopwords
# The corpus must be present locally before stopwords.words(...) is called;
# the download is a no-op message when the package is already up to date.
# NOTE(review): `numpy` and `string` are not used in the visible cell.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Step 2: Load the datasets
# Both CSV files are read from the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Step 3: Data Cleaning & Preprocessing Function
# Shared helpers used by preprocess_text: a Porter stemmer and the English
# stop-word list as a set for fast membership checks.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize a raw review into a space-joined string of stemmed tokens.

    Processing order: lowercase, strip URLs, strip @mentions and '#' markers,
    drop punctuation, drop digits, split on whitespace, remove English stop
    words, then Porter-stem each remaining word.

    Args:
        text: The raw review. Any value is accepted because it is coerced
            with ``str()`` first.

    Returns:
        str: The cleaned tokens joined by single spaces (may be empty).

    Relies on the module-level ``stop_words`` set and ``stemmer`` instance.
    """
    text = str(text).lower()
    # Remove links. `http\S+` already covers https URLs.
    text = re.sub(r"http\S+|www\S+", '', text)
    # Remove whole @mentions and the '#' marker of hashtags (the tag word is kept).
    # The previous pattern `\@w+` only matched '@' followed by literal 'w'
    # characters, so mentions were never actually removed.
    text = re.sub(r'@\w+|#', '', text)
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = re.sub(r'\d+', '', text)  # remove numbers
    # split() with no arguments already ignores leading/trailing whitespace.
    words = text.split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return " ".join(words)

# Step 4: Combine and clean relevant text columns
# The same two derived columns are added to both frames:
#   review_text - review body and title joined by a space (missing values -> '')
#   clean_text  - review_text passed through preprocess_text
for frame in (train, test):
    body = frame['Review_Text'].fillna('')
    title = frame['Review_Title'].fillna('')
    frame['review_text'] = body + ' ' + title
    frame['clean_text'] = frame['review_text'].apply(preprocess_text)

# Step 5: Encode labels
# LabelEncoder assigns integer codes in sorted order of the distinct label
# values; the codes are mapped back to the original labels in Step 11.
# NOTE(review): a Positive=1 / Negative=0 mapping assumes those are the label strings - confirm.
le = LabelEncoder()
y = le.fit_transform(train['sentiment'])

# Step 6: Train-validation split on the cleaned TEXT (before any vectorization)
# Splitting first keeps the validation rows out of the TF-IDF vocabulary and
# IDF statistics, so the validation score is not inflated by leakage.
text_train, text_val, y_train, y_val = train_test_split(
    train['clean_text'], y, test_size=0.2, random_state=42
)

# Step 7: TF-IDF fitted on the training portion only (used for validation)
val_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train = val_vectorizer.fit_transform(text_train)
X_val = val_vectorizer.transform(text_val)

# Step 8: Train the Model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Step 9: Validate (for accuracy check during development)
# NOTE(review): a perfect score is suspicious - check for duplicated reviews
# across the split or label information present in the text columns.
val_preds = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_preds)
print("Validation Accuracy:", val_accuracy)

# Step 10: Train on full data
# For the final model, the vectorizer is refitted on every training row and
# the classifier is refitted from scratch on the resulting features.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X = vectorizer.fit_transform(train['clean_text'])
X_test_final = vectorizer.transform(test['clean_text'])
model.fit(X, y)

# Step 11: Predict on test data
predictions = model.predict(X_test_final)
predicted_labels = le.inverse_transform(predictions)

# Step 12: Create output DataFrame
# Two columns: the test row IDs and the predicted sentiment label for each row.
output = test[['ID']].assign(sentiment=predicted_labels)

# Step 13: Save to CSV
# index=False keeps the DataFrame index out of the written file.
output.to_csv("output.csv", index=False)
print("Predictions saved to output.csv")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rohith\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Validation Accuracy: 1.0
Predictions saved to output.csv
