In [2]:
from nltk.tag import UnigramTagger
from nltk.corpus import treebank
import streamlit as st

In [6]:
train_sents = treebank.tagged_sents()[:3000]
tagger = UnigramTagger(train_sents)
treebank.sents()[0]

['Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov.',
 '29',
 '.']

In [7]:
tagger.tag(treebank.sents()[0])

[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('Nov.', 'NNP'),
 ('29', 'CD'),
 ('.', '.')]

In [8]:
import os
def load_reviews_from_directory(directory):
    reviews = []
    labels = []
    
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            with open(os.path.join(directory, filename), "r", encoding="utf-8") as file:
                review = file.read()
                reviews.append(review)
                if directory.endswith("pos"):
                    labels.append("positive")
                elif directory.endswith("neg"):
                    labels.append("negative")
    
    return reviews, labels

# Load positive and negative reviews
positive_reviews, positive_labels = load_reviews_from_directory("review_polarity/txt_sentoken/pos")
negative_reviews, negative_labels = load_reviews_from_directory("review_polarity/txt_sentoken/neg")


In [9]:
# Combine positive and negative reviews and labels
all_reviews = positive_reviews + negative_reviews
all_labels = positive_labels + negative_labels

# Example: print the first positive and negative reviews
print("Positive Review:")
print(positive_reviews[0])
print("Label:", positive_labels[0])

print("\nNegative Review:")
print(all_reviews[len(positive_reviews)])
print("Label:", all_labels[len(positive_labels)])

Positive Review:
assume nothing . 
the phrase is perhaps one of the most used of the 1990's , as first impressions and rumors are hardly ever what they seem to be . 
the phrase especially goes for oscar novak , an architect who is the main focus of three to tango , a delightful , funny romantic comedy about assumptions and being yourself . 
novak ( matthew perry ) , a shy , clumsy , chicago based architect , along with openly gay partner , peter steinberg ( oliver platt ) , fights for projects day in and day out . 
one of these is the job of restoring a popular building for charles newman ( dylan mcdermott ) , a rich , well-known businessman . 
charles immediately takes a liking to oscar , as he enjoys his personality and sense of humor . 
seeing oscar as someone he could trust , charles asks him to watch his girlfriend , an unpredictable , adventurous girl named amy post ( neve campbell ) , who makes a living by blowing glass . 
charles wants to know who she talks to , what she does ,

In [None]:
import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('sentiwordnet')

# Define a function to calculate sentiment scores
def calculate_sentiment_score(review_text):
    sentiment_score = 0
    word_count = 0
    lemmatizer = WordNetLemmatizer()

    tokens = word_tokenize(review_text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    tokens = [token for token in tokens if token not in stopwords.words('english')]

    for word, pos in pos_tag(tokens):
        pos_score, neg_score = 0, 0
        synsets = list(swn.senti_synsets(word, pos))
        
        if synsets:
            pos_score = synsets[0].pos_score()
            neg_score = synsets[0].neg_score()
            sentiment_score += pos_score - neg_score
            word_count += 1

    if word_count > 0:
        sentiment_score /= word_count

    return sentiment_score

# Load your movie review dataset and preprocess it
# Replace this with your own dataset loading and preprocessing logic

# Assuming you have 'reviews' and 'labels' lists
# reviews = preprocess_reviews(reviews)  # Implement your preprocessing function

# Calculate sentiment scores for each review
sentiment_scores = [calculate_sentiment_score(review) for review in reviews]

# Split the data into a training and testing set
X_train, X_test, y_train, y_test = train_test_split(sentiment_scores, labels, test_size=0.2, random_state=42)

# Create a RandomForest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(report)


In [14]:
# Streamlit app
st.title("Movie Review Sentiment Classification")

# Input for entering a movie review
review_input = st.text_area("Enter a movie review:")

# Button for classifying the review
if st.button("Classify"):
    if review_input:
        sentiment = classify_sentiment(review_input.split())
        st.write(f"Sentiment: {sentiment}")
    else:
        st.write("Please enter a review to classify.")

2023-10-16 16:48:43.035 
  command:

    streamlit run /Users/thibautlora/Library/Python/3.9/lib/python/site-packages/ipykernel_launcher.py [ARGUMENTS]
