In this sentiment analysis project, we take a dataset of movie reviews and use it to train a model. We then use the trained model to predict whether manually entered text expresses a positive or a negative sentiment.

In [7]:
# !pip install -U scikit-learn



In [24]:
import logging

import pandas as pd
from datasets import load_dataset
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

# Load the Rotten Tomatoes review corpus and materialise the train and test
# splits as pandas DataFrames.
dataset = load_dataset("cornell-movie-review-data/rotten_tomatoes")
df_train, df_test = (
    pd.DataFrame(dataset[split_name]) for split_name in ("train", "test")
)

# Stopword lists already fetched from NLTK, keyed by language name.
_STOPWORD_CACHE = {}


def _stopword_set(language='english'):
    """Return NLTK's stopword list for ``language`` as a frozenset, fetched once."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def preprocessing(text):
    """Tokenize ``text`` with NLTK and drop English stopwords.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces. Tokens keep their
        original casing; only the stopword comparison is case-insensitive.
    """
    # Look the stopwords up once instead of calling stopwords.words('english')
    # for every token, and use a set so each membership test is O(1).
    english_stopwords = _stopword_set('english')

    # Tokenize the new text using NLTK
    new_words = word_tokenize(text)

    # Remove stopwords (compared in lower case)
    new_filtered_words = [
        word for word in new_words if word.lower() not in english_stopwords]

    # Join the filtered words to form a clean text
    return ' '.join(new_filtered_words)

# Clean the review text of both splits (tokenize + stopword removal),
# overwriting the "text" column, then preview the training frame.
df_train["text"], df_test["text"] = (
    frame["text"].apply(preprocessing) for frame in (df_train, df_test)
)
df_train.head()

Unnamed: 0,text,label
0,rock destined 21st century 's new `` conan `` ...,1
1,gorgeously elaborate continuation `` lord ring...,1
2,effective too-tepid biopic,1
3,"sometimes like go movies fun , wasabi good pla...",1
4,"emerges something rare , issue movie 's honest...",1


In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF configuration: keep at most 800 terms, ignoring terms that occur in
# more than 90% of the documents or in fewer than 2 of them.
# (An earlier variant used only max_features=1000 with the same stop-word list.)
tfidfV = TfidfVectorizer(
    stop_words='english',
    max_features=800,
    min_df=2,
    max_df=0.9,
)

# Targets for both splits.
y_train, y_test = df_train["label"], df_test["label"]

# The vocabulary is fitted on the training texts only and reused for the test texts.
X_train = tfidfV.fit_transform(df_train["text"].values)
X_test = tfidfV.transform(df_test["text"].values)

print(X_train.shape, y_train.shape, sep="\n")


(8530, 800)
(8530,)


In [47]:
# Peek at the preprocessed training texts as a NumPy array (the same
# expression that is passed to the TF-IDF vectorizer).
df_train["text"].values

array(["rock destined 21st century 's new `` conan `` 's going make splash even greater arnold schwarzenegger , jean-claud van damme steven segal .",
       "gorgeously elaborate continuation `` lord rings `` trilogy huge column words adequately describe co-writer/director peter jackson 's expanded vision j . r . r . tolkien 's middle-earth .",
       'effective too-tepid biopic', ...,
       "hardly nuanced portrait young woman 's breakdown , film nevertheless works scares .",
       'interminably bleak , say nothing boring .',
       'things really get weird , though particularly scary : movie portent content .'],
      dtype=object)

In [49]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression baseline with the estimator's default settings,
# fitted on the TF-IDF features and labels of the training split.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)


In [50]:
# Predicted labels for the test split (kept for inspection; the score call
# below evaluates the model on X_test itself rather than reusing them).
y_pred_test = logreg.predict(X_test)
# Report the model's score on the held-out test split, labelled as accuracy.
print("Accuracy on test set:", logreg.score(X_test, y_test))

Accuracy on test set: 0.7129455909943715


In [51]:
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.metrics import accuracy_score
# Decision-tree baseline on the same TF-IDF features.
# The seed is fixed so the fitted tree, and the accuracy reported from it, is
# identical on every Restart & Run All; without random_state the classifier
# permutes candidate features randomly at each split.
model = DT(random_state=42)
model.fit(X_train,y_train)


In [52]:
# Predict test-split labels with the decision tree and print the accuracy
# of those predictions against the true labels.
prid = model.predict(X_test)
print(accuracy_score(y_test,prid))

0.6153846153846154


In [41]:
def extract_features(df,field,training_data,testing_data,type="binary"):
    """Vectorize a text column of the train and test frames.

    Parameters
    ----------
    df :
        Unused; kept so existing callers keep working.
    field : str
        Name of the text column to vectorize.
    training_data, testing_data :
        Frames exposing ``[field].values``. The vocabulary is learned from
        ``training_data`` only and then applied to ``testing_data``.
    type : str, default "binary"
        Feature representation, matched by substring:
        contains ``"binary"`` -> binary CountVectorizer (presence/absence);
        contains ``"counts"`` -> CountVectorizer with raw term counts;
        anything else -> TfidfVectorizer.

    Returns
    -------
    tuple
        ``(train_feature_set, test_feature_set, fitted_vectorizer)``.
    """
    logging.info("Extracting features and creating vocabulary...")

    # Only the vectorizer differs between the three representations; the
    # fit/transform steps below are shared.
    if "binary" in type:
        # BINARY FEATURE REPRESENTATION
        vectorizer = CountVectorizer(binary=True, max_df=0.95)
    elif "counts" in type:
        # COUNT BASED FEATURE REPRESENTATION
        vectorizer = CountVectorizer(binary=False, max_df=0.95)
    else:
        # TF-IDF BASED FEATURE REPRESENTATION
        vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95)

    # fit_transform learns the vocabulary and returns the training matrix in
    # one pass, so the training texts are not vectorized a second time.
    train_feature_set = vectorizer.fit_transform(training_data[field].values)
    test_feature_set = vectorizer.transform(testing_data[field].values)

    return train_feature_set, test_feature_set, vectorizer