In [346]:
import math
import string

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer

In [347]:
# Load the scored articles from the comma-separated dataset, decoding the
# file as Latin-1.
# Alternative input used during development: "test.csv" (same read options).
articles = pd.read_csv(
    "italian_dataset.csv",
    sep=",",
    encoding="latin1",
)
articles.head(1)

Unnamed: 0,translated_title,score,text,source
0,"Sinner-Rune, the Atp Finals match in Turin. Ja...",4,Ore 00:05 - Il terzo capolavoro di Sinner(Gaia...,https://www.corriere.it/sport/tennis/diretta-l...


In [348]:
# Translate the 1-5 `score` into a signed sentiment label centred on zero:
# 1 -> -2, 2 -> -1, 3 -> 0, 4 -> 1, 5 -> 2.
SCORE_TO_SENTIMENT = {5: 2, 4: 1, 3: 0, 2: -1, 1: -2}

# Vectorised lookup instead of a row-by-row `.loc` loop: it is faster and does
# not assume the frame has a default 0..n-1 index.  Any score that is not one
# of the five known ratings (including a missing value) maps to NaN and is then
# given the neutral label 0, which is what the previous zero-initialised column
# left in place for such rows.
articles['sentiment'] = (
    articles['score']
    .map(SCORE_TO_SENTIMENT)
    .fillna(0)
    .astype('int64')
)

articles.head()

Unnamed: 0,translated_title,score,text,source,sentiment
0,"Sinner-Rune, the Atp Finals match in Turin. Ja...",4,Ore 00:05 - Il terzo capolavoro di Sinner(Gaia...,https://www.corriere.it/sport/tennis/diretta-l...,1
1,"Giulia Cecchettin and Filippo Turetta missing,...",1,"Treviso, l'immagine risale alla notte tra saba...",https://corrieredelveneto.corriere.it/notizie/...,-2
2,"Israel - Hamas at war, today's news | Idf: ""Hi...",1,È il 41esimo giorno di guerra: il bilancio tra...,https://www.corriere.it/esteri/diretta-live/23...,-2
3,The maneuver of broken promises: the numbers t...,2,"Curioso scioperare contro questa manovra, che...",https://www.repubblica.it/economia/2023/11/17/...,-1
4,"Former CGIL leader Sergio Cofferati: ""This rig...",3,"ROMA  Sergio Cofferati, ex leader della Cgil,...",https://www.repubblica.it/economia/2023/11/16/...,0


In [349]:
def remove_ineligible(text):
    """Keep only alphanumeric, whitespace and ASCII punctuation characters.

    Parameters
    ----------
    text : object
        Value to clean. Anything whose exact type is not ``str`` is treated
        as missing.

    Returns
    -------
    str
        ``text`` with every other character dropped, or ``''`` when ``text``
        is not a ``str``.
    """
    if type(text) is not str:
        return ''

    allowed_symbols = string.punctuation
    kept = [
        ch for ch in text
        if ch.isalnum() or ch.isspace() or ch in allowed_symbols
    ]
    return ''.join(kept)

def remove_num(text):
    """Strip every numeric character from a string.

    Parameters
    ----------
    text : object
        Value to clean. Anything whose exact type is not ``str`` is treated
        as missing.

    Returns
    -------
    str
        ``text`` without the characters for which ``str.isnumeric()`` is
        true, or ``''`` when ``text`` is not a ``str``.
    """
    if type(text) is not str:
        return ''

    return ''.join(filter(lambda ch: not ch.isnumeric(), text))

# Build the modelling text in two passes over the raw article body: first drop
# characters outside the allowed set, then drop every numeric character.
articles['text_clean'] = (
    articles['text']
    .apply(remove_ineligible)
    .apply(remove_num)
)

In [350]:
# articles = articles[articles['score'] != 3]
# articles = articles.copy()
# articles

In [364]:
# Every name this cell needs is imported here so that it runs on a fresh kernel
# when the notebook is executed top to bottom (TfidfVectorizer and
# ENGLISH_STOP_WORDS were previously only imported by a later cell).
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# How many words to report at each end of the coefficient ranking.  The printed
# labels are derived from this value so they cannot disagree with the lists.
N_TOP_WORDS = 20

X = articles['text_clean']
y = articles['sentiment']

# English stop words shipped with scikit-learn plus a hand-picked Italian list.
italian_stop_words = ["del", "suo", "con", "più", "di", "mi", "oggi",
                      "novembre", "le", "da", "fare", "il", "come", "dei",
                      "se", "ho", "la", "totti", "alla", "dalle", "sarà",
                      "diventato", "tutto"]
stop_words = list(ENGLISH_STOP_WORDS) + italian_stop_words

# TF-IDF features restricted by document frequency (min_df=0.05, max_df=0.85).
vectorizer = TfidfVectorizer(max_features=100000, stop_words=stop_words, max_df=0.85, min_df=0.05)
X_tfidf = vectorizer.fit_transform(X)

# Fixed seed so the 80/20 split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Treat the signed sentiment label as a numeric target.
model = LinearRegression()
model.fit(X_train, y_train)

# Rank the vocabulary by fitted weight.  argsort is ascending, so the first
# N_TOP_WORDS indices carry the most negative weights and the last N_TOP_WORDS
# the most positive ones (strongest word last in the positive list).
coefficients = model.coef_
ranked_indices = np.argsort(coefficients)
top_negative_indices = ranked_indices[:N_TOP_WORDS]
top_positive_indices = ranked_indices[-N_TOP_WORDS:]
features = vectorizer.get_feature_names_out()
top_negative_words = [features[i] for i in top_negative_indices]
top_positive_words = [features[i] for i in top_positive_indices]

print(f"Top {N_TOP_WORDS} Negative Words:", top_negative_words)
print(f"Top {N_TOP_WORDS} Positive Words:", top_positive_words)

# Mean squared error on the held-out split; shown as the cell's output.
y_pred = model.predict(X_test)

mean_squared_error(y_test, y_pred)

Top 5 Negative Words: ['ragazzi', 'tre', 'madre', 'morta', 'testa', 'aveva', 'detto', 'presidente', 'artificiale', 'lì', 'foto', 'trovato', 'figlia', 'denuncia', 'gatto', 'mia', 'punto', 'autorità', 'quattro', 'persone']
Top 5 Positive Words: ['opere', 'sindaco', 'subito', 'squadra', 'ue', 'sullo', 'stava', 'nuova', 'storia', 'sua', 'infatti', 'tanto', 'quando', 'pace', 'uno', 'si', 'grazie', 'città', 'comitato', 'nazionale']


3.13714480162741

# Logistic Regression

In [352]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS


# Target is the signed sentiment label; input is the cleaned article text.
y = articles['sentiment']
X = articles['text_clean']

# English stop words shipped with scikit-learn plus a hand-picked Italian list.
italian_stop_words = [
    "del", "suo", "con", "più", "di", "mi", "oggi", "novembre", "le", "da", "fare",
    "il", "come", "dei", "se", "ho", "la",
]
stop_words = [*ENGLISH_STOP_WORDS, *italian_stop_words]

# TF-IDF features restricted by document frequency (min_df=0.01, max_df=0.85).
vectorizer = TfidfVectorizer(
    max_features=100000,
    stop_words=stop_words,
    max_df=0.85,
    min_df=0.01,
)
X_tfidf = vectorizer.fit_transform(X)

# Fixed seed so the 80/20 split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42
)

# L2-penalised logistic regression; with C=1e23 the penalty term is negligible.
model = LogisticRegression(penalty='l2', random_state=1, C=1e23)
model.fit(X_train, y_train)

# Only the FIRST row of the coefficient matrix is inspected.
# NOTE(review): with several sentiment classes that row should belong to the
# first entry of model.classes_ (presumably the lowest label) - confirm.
coefficients = model.coef_[0]
features = vectorizer.get_feature_names_out()

# argsort is ascending: the "negative" list holds the five LARGEST weights of
# that row and the "positive" list the five SMALLEST.
top_negative_indices = coefficients.argsort()[-5:]
top_positive_indices = coefficients.argsort()[:5]

# Translate the selected column indices back into vocabulary terms.
top_negative_words = [features[idx] for idx in top_negative_indices]
top_positive_words = [features[idx] for idx in top_positive_indices]

print("Top 5 Negative Words:", top_negative_words)
print("Top 5 Positive Words:", top_positive_words)

Top 5 Negative Words: ['sparato', 'heather', 'gatto', 'capretta', 'uomo']
Top 5 Positive Words: ['italia', 'infatti', 'tv', 'nazionale', 'contro']


In [353]:
# Predict a sentiment class for each held-out article, then report the mean
# squared error between predicted and true labels as the cell's output.
y_pred = model.predict(X_test)

mean_squared_error(y_true=y_test, y_pred=y_pred)

4.25

# Counting Model

In [354]:
# Make counts
vectorizer = CountVectorizer()
count_matrix = vectorizer.fit_transform(articles['text_clean'])

# Make a new DataFrame with the counts information
cleaned_articles = pd.DataFrame(count_matrix.toarray(),
        index=articles.index,
        columns=vectorizer.get_feature_names_out())

# Add the sentiment score and article text
cleaned_articles['sentiment'] = articles['sentiment']
cleaned_articles['text_clean'] = articles['text_clean']

cleaned_articles.head()

Unnamed: 0,abbassata,abbassato,abbastanza,abbattimento,abbattuto,abbelliranno,abbia,abbiamo,abbietti,abbina,...,zampe,zanella,zanzare,zelensky,zero,zoccolo,zona,zone,sentiment,text_clean
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,Ore : - Il terzo capolavoro di Sinner(Gaia Pic...
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,-2,"Treviso, l'immagine risale alla notte tra saba..."
2,0,0,0,0,0,0,0,5,0,0,...,0,0,0,0,0,0,1,0,-2,È il esimo giorno di guerra: il bilancio tra i...
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,-1,"Curioso scioperare contro questa manovra, che ..."
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"ROMA Sergio Cofferati, ex leader della Cgil, ..."


In [355]:
# Hold out 20% of the rows with a fixed seed so the split is reproducible.
train, test = train_test_split(cleaned_articles, test_size=0.2, random_state=42)

# The vocabulary terms are the model inputs; the frame's two extra columns
# (label and text) are selected separately or left out.
features = vectorizer.get_feature_names_out()

# L2-penalised logistic regression; with C=1e23 the penalty term is negligible.
model = LogisticRegression(penalty='l2', random_state=1, C=1e23)
model.fit(X=train[features], y=train['sentiment'])

In [356]:
# Inspect the first row of the fitted coefficient matrix and report its range.
coefficients = model.coef_[0]

print('Smallest coefficient', np.min(coefficients))
print('Largest coefficient:', np.max(coefficients))

Smallest coefficient -0.6689814903432275
Largest coefficient: 0.7927716925116375


In [357]:
# argsort is ascending: the "negative" list holds the five LARGEST weights in
# `coefficients` and the "positive" list the five SMALLEST.
top_negative_indices = coefficients.argsort()[-5:]
top_positive_indices = coefficients.argsort()[:5]

# Translate the selected column indices back into vocabulary terms.
top_negative_words = [features[idx] for idx in top_negative_indices]
top_positive_words = [features[idx] for idx in top_positive_indices]

print("Top 5 Negative Words:", top_negative_words)
print("Top 5 Positive Words:", top_positive_words)

Top 5 Negative Words: ['uomo', 'capretta', 'al', 'che', 'di']
Top 5 Positive Words: ['il', 'del', 'la', 'con', 'come']
