In [45]:
import pandas as pd
import numpy as np

In [46]:
train_data = pd.read_csv("Data/train.csv")
test_data = pd.read_csv("Data/test.csv")
dataframe = pd.concat([train_data, test_data], axis = 0)

In [47]:
dataframe = dataframe.loc[:, ["text", "sentiment"]]
dataframe.dropna(inplace = True)
dataframe.isna().sum()

text         0
sentiment    0
dtype: int64

One Hot Encoding

In [48]:
y_encoded = dataframe['sentiment'].apply(lambda x: 1 if x == 'positive' else 0 if x == 'neutral' else -1 if x == 'negative' else None)

Cleaning the Text

In [49]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Assuming 'text' is the column containing the text data
corpus = []
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')  # Excluding 'not' from stopwords

for text in dataframe["text"]:
    if isinstance(text, str):
        review = re.sub("[^a-zA-Z]", ' ', text)  # Replace non-alphabetic characters with spaces
        review = review.lower()  # Convert to lowercase
        review = review.split()  # Tokenize
        review = [ps.stem(word) for word in review if word not in set(all_stopwords)]  # Stemming and removing stopwords
        review = ' '.join(review)  # Join back to form a string
        corpus.append(review)
    else:
        corpus.append("")  # Appending an empty string for non-string cases

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Seenu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [50]:
print(dataframe["text"])

0                     I`d have responded, if I were going
1           Sooo SAD I will miss you here in San Diego!!!
2                               my boss is bullying me...
3                          what interview! leave me alone
4        Sons of ****, why couldn`t they put them on t...
                              ...                        
3529    its at 3 am, im very tired but i can`t sleep  ...
3530    All alone in this old house again.  Thanks for...
3531     I know what you mean. My little dog is sinkin...
3532    _sutra what is your next youtube video gonna b...
3533     http://twitpic.com/4woj2 - omgssh  ang cute n...
Name: text, Length: 31014, dtype: object


Creating BagOfWords

In [51]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = np.array(y_encoded)
X

# from sklearn.feature_extraction.text import TfidfVectorizer
# vectorizer = TfidfVectorizer()
# vectorizer.fit(corpus)
# X = vectorizer.transform(corpus)
# y = np.array(y_encoded)
# print(X)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

Split the data

In [52]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 0)

In [53]:
X_train[900]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

Training Decision tree model

In [None]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators = 100, random_state=0)
model.fit(X_train, y_train)

Predicting the test set

In [None]:
y_predict = model.predict(X_test)
print(np.concatenate((y_test.reshape(len(y_test), 1), y_predict.reshape(len(y_predict), 1)), 1))

Model evaluation

In [None]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_predict))

In [None]:
import joblib
joblib.dump(model, 'Project_1_ranom_forest.pkl')