In [118]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk import pos_tag

import re
import ast

# download nltk corpus (first time only)
# nltk.download('all')

text = ['''Wait! My name is Erik Johansson, 28 years old and living in
Stockholm. So Paris - it's my dream city. the architecture,
the feeling, the food - everything to do with that city. When I am
there it feels like I'm in a movie, and I never want that
will stop.''', 
'''I now live primarily in New York City and am a
former Munich resident, and his name is Alexander Müller. I am 34 years old
Year old. As for Paris, I kind of admire it
its cultural charm, even if it doesn't quite match mine
Affection for the cities I've been to can compete
have lived.''',
'''Hi! To be honest, Paris isn't really my thing.
I like a few cool spots, but overall it's
a little too chaotic for me. Ah and I'm Maxime Dubois,
I am 25 years old and I live in Paris.''',
'''Hey there! I'm Tiffany Smith, currently living in Seattle. Oh my 
 gosh, let me tell you, Paris is like, totally amazing! The fashion, 
 the cafes, the Eiffel Tower... I just can't get enough! It's, like, my 
 favorite place ever, you know?''',
'''I'm Sarah, 30 years old, I call Zimbabwe my home.
Paris, well, it has its charms, you know? History,
art, it's all fun. But you have to admit, it's not
without its faults. It's a cool place to visit,
but I'm not sure if I could see myself living there
at all.''',
 '''Greetings. I am William Thompson, a resident of London, 
 England, aged 38. While I have yet to grace the streets of Paris 
 with my presence, I hold a respectful fascination for its esteemed 
 reputation. ''',
 '''Hello, I'm Jane Smith, 34, from Manchester, England. I've never 
 been to Paris, so I can't really say much about it. But I'm open to 
 exploring its cultural offerings someday. ''',
'''Hello, I am Selim Demir. I am 30 years old and I came from Turkey.
I was in Paris, but I didn't like it very much. My city expectations
I couldn't meet it, I couldn't make a connection.''']

 
df = pd.DataFrame(text, columns=["sentences"])


# def preprocess_text(text):
#     tokens = word_tokenize(text.lower())
#     filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]
#     lemmatizer = WordNetLemmatizer()
#     lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
#     processed_text = ' '.join(lemmatized_tokens)
#     return processed_text
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

def calculate_formality_score(sentence):
    model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    # Tokenize and predict sentiment
    tokens = tokenizer(sentence, return_tensors="pt", truncation=True)
    outputs = model(**tokens)
    predicted_class = int(outputs.logits.argmax())
    
    # Rescale the predicted sentiment score to the range [0, 1]
    formality_score = (predicted_class + 1) / 5.0
    return formality_score

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatization (optional, requires NLTK WordNet)
    # Uncomment the following lines if you want to perform lemmatization
    lemmatizer = nltk.WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Join the tokens back into a single string
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text


df['formality'] = df['sentences'].apply(calculate_formality_score)
df['sentences'] = df['sentences'].apply(preprocess_text)

analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text):
    scores = analyzer.polarity_scores(text)
    return (scores['compound'] + 1)/2

df['sentiment'] = df['sentences'].apply(get_sentiment)

# df['pred'] = df['sentiment'].apply(lambda x: 'Negative' if -1<=x['compound']<=-0.5 else ('Neutral' if -0.5<x['compound'] <=0.5 else 'Positive'))

df.to_csv('sentiment.csv', index=False)
df


0    Wait! My name is Erik Johansson, 28 years old ...
1    I now live primarily in New York City and am a...
2    Hi! To be honest, Paris isn't really my thing....
3    Hey there! I'm Tiffany Smith, currently living...
4    I'm Sarah, 30 years old, I call Zimbabwe my ho...
5    Greetings. I am William Thompson, a resident o...
6    Hello, I'm Jane Smith, 34, from Manchester, En...
7    Hello, I am Selim Demir. I am 30 years old and...
Name: sentences, dtype: object


Unnamed: 0,sentences,formality,sentiment
0,wait name erik johansson year old living stock...,0.4,0.8437
1,live primarily new york city former munich res...,0.6,0.95815
2,hi honest paris isnt really thing like cool sp...,0.6,0.5816
3,hey im tiffany smith currently living seattle ...,1.0,0.951
4,im sarah year old call zimbabwe home paris wel...,0.6,0.9442
5,greeting william thompson resident london engl...,0.8,0.963
6,hello im jane smith manchester england ive nev...,0.8,0.5
7,hello selim demir year old came turkey paris d...,0.4,0.36225
