In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Wczytanie danych
df = pd.read_csv(r"C:\Users\tomas\Desktop\Praca dyplomowa\Tweets_Kaggle_Clean.csv")

# Pobranie listy stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Lista nazw linii lotniczych i nieistotnych zwrotów
unnecessary_words = ["united", "american", "delta", "southwest", "jetblue", "virginamerica", "usairways",
                      "flight", "flights", "airline", "plane", "hi", "hello", "australia", "hawaii", "mexico"]

# Funkcja do czyszczenia tekstu
def clean_text(text):
    text = text.lower()  # Małe litery
    text = re.sub(r"http\S+|www\S+", "", text)  # Usunięcie linków
    text = re.sub(r"@\S+|#\S+", "", text)  # Usunięcie mentionów i hashtagów
    words = text.split()
    words = [word for word in words if word not in stop_words and not any(air in word for air in unnecessary_words)]
    return " ".join(words)

# Zastosowanie funkcji do danych
df["clean_text"] = df["text"].apply(clean_text)

# TF-IDF
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X = vectorizer.fit_transform(df["clean_text"])

y = df["airline_sentiment"].map({"negative": 0, "neutral": 1, "positive": 2})

# Podział danych
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model regresji logistycznej
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Raport
print("\n📊 Raport klasyfikacji:\n", classification_report(y_test, y_pred))
print("\n🎯 Dokładność modelu:", accuracy_score(y_test, y_pred))

# Analiza wpływowych słów
feature_names = vectorizer.get_feature_names_out()
class_labels = ["negative", "neutral", "positive"]

for i, label in enumerate(class_labels):
    top_words = np.argsort(model.coef_[i])[-10:]
    print(f"\n🔹 Najbardziej wpływowe słowa dla klasy '{label}':")
    print([feature_names[j] for j in top_words])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tomas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



📊 Raport klasyfikacji:
               precision    recall  f1-score   support

           0       0.80      0.94      0.87      1889
           1       0.66      0.42      0.52       580
           2       0.81      0.59      0.68       459

    accuracy                           0.78      2928
   macro avg       0.76      0.65      0.69      2928
weighted avg       0.77      0.78      0.77      2928


🎯 Dokładność modelu: 0.7844945355191257

🔹 Najbardziej wpływowe słowa dla klasy 'negative':
['customers', 'hrs', 'fix', 'luggage', 'hour', 'hold', 'cancelled', 'delayed', 'worst', 'hours']

🔹 Najbardziej wpływowe słowa dla klasy 'neutral':
['winners', 'question', 'fleek', 'need', 'march', 'policy', 'chance', 'number', 'photo', 'ceo']

🔹 Najbardziej wpływowe słowa dla klasy 'positive':
['thx', 'excellent', 'appreciate', 'love', 'best', 'amazing', 'awesome', 'great', 'thanks', 'thank']
