In [4]:
import os

# List files in dataset folder
print(os.listdir("dataset/FinancialPhraseBank-v1.0"))


['README.txt', 'Sentences_75Agree.txt', 'License.txt', 'Sentences_50Agree.txt', 'Sentences_66Agree.txt', 'Sentences_AllAgree.txt']


In [5]:
import zipfile
import os

# Set the path of the ZIP file
zip_path = "/content/FinancialPhraseBank-v1.0.zip"  # Change this to your ZIP file name

# Extract the ZIP file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall("dataset")  # Extract to 'dataset' folder

# Check extracted files
print("Extracted Files:", os.listdir("dataset"))


Extracted Files: ['FinancialPhraseBank-v1.0', '__MACOSX']


In [6]:
import pandas as pd

file_path = "dataset/FinancialPhraseBank-v1.0/Sentences_50Agree.txt"  # Update based on Step 1 output

# Read the dataset
with open(file_path, "r", encoding="ISO-8859-1") as file:
    data = file.readlines()

# Split sentences and labels
sentences = []
labels = []
for line in data:
    parts = line.strip().rsplit(" @", 1)  # Split last occurrence of "@"
    if len(parts) == 2:
        sentences.append(parts[0])
        labels.append(parts[1])

# Create DataFrame
df = pd.DataFrame({"text": sentences, "sentiment": labels})

print(df.head())  # View dataset preview


Empty DataFrame
Columns: [text, sentiment]
Index: []


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Convert sentiments into numerical labels
label_encoder = LabelEncoder()
df["sentiment"] = label_encoder.fit_transform(df["sentiment"])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df["text"], df["sentiment"], test_size=0.2, random_state=42)

# Convert text to numerical features using TF-IDF
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Train the model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Predict on test set
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7443298969072165
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.42      0.55       110
           1       0.73      0.95      0.83       571
           2       0.77      0.47      0.58       289

    accuracy                           0.74       970
   macro avg       0.77      0.61      0.65       970
weighted avg       0.75      0.74      0.72       970



In [11]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE only on the training set
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_tfidf, y_train)


In [12]:
from sklearn.ensemble import RandomForestClassifier

# Train a Random Forest model
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train_balanced, y_train_balanced)

# Predict
y_pred_rf = rf_model.predict(X_test_tfidf)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))


Accuracy: 0.7639175257731958
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.54      0.62       110
           1       0.77      0.94      0.85       571
           2       0.77      0.50      0.61       289

    accuracy                           0.76       970
   macro avg       0.75      0.66      0.69       970
weighted avg       0.76      0.76      0.75       970



In [13]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [10, 20, 30],
}

grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=3, scoring="accuracy", n_jobs=-1)
grid_search.fit(X_train_balanced, y_train_balanced)

print("Best Parameters:", grid_search.best_params_)


Best Parameters: {'max_depth': 30, 'n_estimators': 100}


In [14]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [10, 20, 30],
}

grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=3, scoring="accuracy", n_jobs=-1)
grid_search.fit(X_train_balanced, y_train_balanced)

print("Best Parameters:", grid_search.best_params_)


Best Parameters: {'max_depth': 30, 'n_estimators': 300}


In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train Random Forest with the best parameters
rf_model = RandomForestClassifier(n_estimators=300, max_depth=30, random_state=42)
rf_model.fit(X_train_balanced, y_train_balanced)

# Predict on test data
y_pred_rf = rf_model.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))


Accuracy: 0.7360824742268042
Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.49      0.57       110
           1       0.74      0.95      0.83       571
           2       0.76      0.40      0.53       289

    accuracy                           0.74       970
   macro avg       0.73      0.62      0.64       970
weighted avg       0.74      0.74      0.71       970

