In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Wczytanie danych
df = pd.read_csv("https://raw.githubusercontent.com/TomaszSzwon/Airlines_Twitter/refs/heads/main/Clean_tweets/Tweets_final_cleaned.csv")

# Pobranie listy stopwords
nltk.download('stopwords') #biblioteka
nltk.download('punkt_tab') #biblioteka puntor√≥w
stop_words = set(stopwords.words('english'))

# Lista nazw linii lotniczych i nieistotnych zwrot√≥w
unnecessary_words = ["united", "american", "delta", "southwest", "jetblue", "virginamerica", "usairways",
                      "flight", "flights", "airline", "plane", "hi", "hello", "australia", "hawaii", "mexico", "atlanta", "march", "462"]

#Aktualizacja 'stopwords'
stop_words.update(unnecessary_words)  # Dodajemy zbƒôdne s≈Çowa do listy stopwords

# Funkcja do czyszczenia tekstu
def clean_text(text):
    text = text.lower()  # Ma≈Çe litery
    text = re.sub(r"http\S+|www\S+", "", text)  # Usuniƒôcie link√≥w
    text = re.sub(r"@\S+|#\S+", "", text)  # Usuniƒôcie mention√≥w i hashtag√≥w
    tokens = word_tokenize(text)  # Tokenizacja
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    return " ".join(tokens)

# Zastosowanie funkcji do danych
df["clean_text"] = df["text"].apply(clean_text)

# Filtrujemy neutralne tweety
neutral_tweets = df[df["airline_sentiment"] == "neutral"]

# Zduplikowanie neutralnych tweet√≥w
df_oversampled = pd.concat([df, neutral_tweets, neutral_tweets], ignore_index=True)

# TF-IDF-przekszta≈Çca tekst w macierz liczbowƒÖ na podstawie miary TF-IDF (Term Frequency-Inverse Document Frequency).
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X = vectorizer.fit_transform(df_oversampled["clean_text"])

y = df_oversampled["airline_sentiment"].map({"negative": 0, "neutral": 1, "positive": 2})  # Konwersja etykiet sentymentu na liczby

# Podzia≈Ç danych na zbi√≥r treningowy i testowy
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model regresji logistycznej
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Raport
print("\nüìä Raport klasyfikacji:\n", classification_report(y_test, y_pred))
print("\nüéØ Dok≈Çadno≈õƒá modelu:", accuracy_score(y_test, y_pred))

# Analiza wp≈Çywowych s≈Ç√≥w
feature_names = vectorizer.get_feature_names_out()
class_labels = ["negative", "neutral", "positive"]

for i, label in enumerate(class_labels):
    top_words = np.argsort(model.coef_[i])[-10:]
    print(f"\nüîπ Najbardziej wp≈Çywowe s≈Çowa dla klasy '{label}':")
    print([feature_names[j] for j in top_words])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tomas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\tomas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!



üìä Raport klasyfikacji:
               precision    recall  f1-score   support

           0       0.83      0.81      0.82      1844
           1       0.77      0.86      0.81      1859
           2       0.79      0.44      0.56       456

    accuracy                           0.79      4159
   macro avg       0.79      0.70      0.73      4159
weighted avg       0.80      0.79      0.79      4159


üéØ Dok≈Çadno≈õƒá modelu: 0.7939408511661457

üîπ Najbardziej wp≈Çywowe s≈Çowa dla klasy 'negative':
['delay', 'lost', 'luggage', 'hrs', 'cancelled', 'hour', 'hold', 'delayed', 'hours', 'worst']

üîπ Najbardziej wp≈Çywowe s≈Çowa dla klasy 'neutral':
['journal', 'discounts', 'whehter', 'winners', 'friend', 'hungupnohelp', 'anytime', 'ceo', 'chance', 'question']

üîπ Najbardziej wp≈Çywowe s≈Çowa dla klasy 'positive':
['kudos', 'rock', 'appreciate', 'best', 'amazing', 'love', 'awesome', 'great', 'thanks', 'thank']


In [5]:
# Tworzymy DataFrame z wagami cech
coef_df = pd.DataFrame(model.coef_.T, columns=class_labels, index=feature_names).reset_index()
coef_df = coef_df.rename(columns={"index": "keyword"})

# Dla ka≈ºdego sentymentu ‚Äì wyciƒÖgamy top 15 s≈Ç√≥w wg absolutnej warto≈õci wagi
top_n = 15
top_words_all = pd.DataFrame()

for label in class_labels:
    temp = coef_df[['keyword', label]].copy()
    temp["abs_weight"] = temp[label].abs()
    temp = temp.sort_values("abs_weight", ascending=False).head(top_n)
    temp["sentiment"] = label
    temp = temp[["keyword", label, "sentiment"]].rename(columns={label: "weight"})
    top_words_all = pd.concat([top_words_all, temp], ignore_index=True)

# Zapisz do pliku CSV
top_words_all.to_csv("logreg_keywords_weights.csv", index=False)
print("\nüíæ Zapisano plik 'logreg_keywords_weights.csv' z kluczowymi s≈Çowami i wagami.")


üíæ Zapisano plik 'logreg_keywords_weights.csv' z kluczowymi s≈Çowami i wagami.


In [7]:
from sklearn.metrics import confusion_matrix
import pandas as pd

# Nazwy klas (muszƒÖ odpowiadaƒá Twojemu mapowaniu w y)
class_names = ["negative", "neutral", "positive"]

# Tworzymy confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Konwertujemy do listy s≈Çownik√≥w (czytelna struktura do CSV)
conf_matrix_data = []
for i, actual_label in enumerate(class_names):
    for j, predicted_label in enumerate(class_names):
        conf_matrix_data.append({
            "Actual": actual_label,
            "Predicted": predicted_label,
            "Count": cm[i][j]
        })

# Tworzymy DataFrame i zapisujemy jako CSV
df_cm = pd.DataFrame(conf_matrix_data)
df_cm.to_csv("confusion_matrix_sentiment.csv", index=False)
print("‚úÖ Confusion matrix zapisany jako 'confusion_matrix_sentiment.csv'")


‚úÖ Confusion matrix zapisany jako 'confusion_matrix_sentiment.csv'


In [4]:
tweets = [
    "Once again @airline_name delayed my flight! This is the third time in a row, it‚Äôs clear they can't manage their schedule. It's getting really frustrating! üò§",
    "I recently flew with @airline_name and I have to say, the service was amazing! Comfortable seats, great food, and a very friendly crew. I'll definitely fly with them again! ‚úàÔ∏èüòä",
    "Flew today with @airline_name. The flight went by without any major issues, but nothing really stood out. Just an ordinary trip.",
    "I always enjoy flying with @airline_name, but today I feel like a VIP ‚Äì all the pilots smiling and the flight attendants are almost too friendly! üòÇ‚úàÔ∏è",
    "Just landed in Paris. No delays, everything went smoothly.",
    "Waiting for my flight to board. The gate area is a bit crowded, but it‚Äôs manageable.",
    "Saw an interesting article about air travel today. Looks like there are a lot of new safety regulations.",
    "Got my boarding pass. Still have some time before the flight, so just relaxing.",
    "The airport was pretty busy today, but it‚Äôs always like that around this time."
]

def analyze_tweets(tweets):
    sentiment_labels = {0: "negative", 1: "neutral", 2: "positive"}

    for tweet in tweets:
        clean_tweet = clean_text(tweet)  # Oczyszczenie
        vectorized_tweet = vectorizer.transform([clean_tweet])  # Przekszta≈Çcenie TF-IDF
        prediction = model.predict(vectorized_tweet)[0]  # Przewidywanie sentymentu
        print(f"Tweet: {tweet}")
        print("üîç Przewidziany sentyment:", sentiment_labels[prediction])
        print("-" * 50)

analyze_tweets(tweets)

Tweet: Once again @airline_name delayed my flight! This is the third time in a row, it‚Äôs clear they can't manage their schedule. It's getting really frustrating! üò§
üîç Przewidziany sentyment: negative
--------------------------------------------------
Tweet: I recently flew with @airline_name and I have to say, the service was amazing! Comfortable seats, great food, and a very friendly crew. I'll definitely fly with them again! ‚úàÔ∏èüòä
üîç Przewidziany sentyment: positive
--------------------------------------------------
Tweet: Flew today with @airline_name. The flight went by without any major issues, but nothing really stood out. Just an ordinary trip.
üîç Przewidziany sentyment: negative
--------------------------------------------------
Tweet: I always enjoy flying with @airline_name, but today I feel like a VIP ‚Äì all the pilots smiling and the flight attendants are almost too friendly! üòÇ‚úàÔ∏è
üîç Przewidziany sentyment: positive
---------------------------------