In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [2]:
import warnings
warnings.filterwarnings('ignore')

# Домашняя работа

Работаем с отзывами на авиа-компанию:

https://www.kaggle.com/datasets/kanchana1990/singapore-airlines-reviews/data


## Easy

Выбрать метрику.

Удалить пунктуацию из датасета.

Преобразовать датасет в BOW или TF-IDF, поделить на треин-тест и спрогнозировать вашей любимой моделью

## Normal

Удалить стоп-слова, применить стемминг/лемматизацию. Обучить несколько моделей, подобрать параметры, сделать выводы.

Написать самостоятельно два отзыва - один положительный, один негативный, посмотреть, угадает ли модель.

## Hard

Исследовать какие слова полученный алгоритм воспринимает как резко-негативные и резко-позитивные. Научиться систематически обманывать классификатор. Написать 5 положительных и 5 негативных отзывов, где модель будет ошибаться. Важен алгоритм по которому вы составляете эти отзывы, а не сами отзывы.

## Easy

In [3]:
data = pd.read_csv('../data/singapore_airlines_reviews.csv')
data.head()

Unnamed: 0,published_date,published_platform,rating,type,text,title,helpful_votes
0,2024-03-12T14:41:14-04:00,Desktop,3,review,We used this airline to go from Singapore to L...,Ok,0
1,2024-03-11T19:39:13-04:00,Desktop,5,review,The service on Singapore Airlines Suites Class...,The service in Suites Class makes one feel lik...,0
2,2024-03-11T12:20:23-04:00,Desktop,1,review,"Booked, paid and received email confirmation f...",Don’t give them your money,0
3,2024-03-11T07:12:27-04:00,Desktop,5,review,"Best airline in the world, seats, food, servic...",Best Airline in the World,0
4,2024-03-10T05:34:18-04:00,Desktop,2,review,Premium Economy Seating on Singapore Airlines ...,Premium Economy Seating on Singapore Airlines ...,0


In [4]:
data['published_platform'].unique()

array(['Desktop', 'Mobile'], dtype=object)

In [5]:
mapping = {'Desktop': 0, 'Mobile': 1}
data['published_platform'] = data['published_platform'].map(mapping)
data.head()

Unnamed: 0,published_date,published_platform,rating,type,text,title,helpful_votes
0,2024-03-12T14:41:14-04:00,0,3,review,We used this airline to go from Singapore to L...,Ok,0
1,2024-03-11T19:39:13-04:00,0,5,review,The service on Singapore Airlines Suites Class...,The service in Suites Class makes one feel lik...,0
2,2024-03-11T12:20:23-04:00,0,1,review,"Booked, paid and received email confirmation f...",Don’t give them your money,0
3,2024-03-11T07:12:27-04:00,0,5,review,"Best airline in the world, seats, food, servic...",Best Airline in the World,0
4,2024-03-10T05:34:18-04:00,0,2,review,Premium Economy Seating on Singapore Airlines ...,Premium Economy Seating on Singapore Airlines ...,0


In [6]:
data['rating'] = data['rating'].apply(lambda x: 0 if x in [1, 2, 3] else 1)
data.head()

Unnamed: 0,published_date,published_platform,rating,type,text,title,helpful_votes
0,2024-03-12T14:41:14-04:00,0,0,review,We used this airline to go from Singapore to L...,Ok,0
1,2024-03-11T19:39:13-04:00,0,1,review,The service on Singapore Airlines Suites Class...,The service in Suites Class makes one feel lik...,0
2,2024-03-11T12:20:23-04:00,0,0,review,"Booked, paid and received email confirmation f...",Don’t give them your money,0
3,2024-03-11T07:12:27-04:00,0,1,review,"Best airline in the world, seats, food, servic...",Best Airline in the World,0
4,2024-03-10T05:34:18-04:00,0,0,review,Premium Economy Seating on Singapore Airlines ...,Premium Economy Seating on Singapore Airlines ...,0


In [7]:
data.isna().sum()

published_date        0
published_platform    0
rating                0
type                  0
text                  0
title                 0
helpful_votes         0
dtype: int64

In [8]:
def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text).lower()
    return text

In [9]:
data['title_text'] = data['title'] + ' ' + data['text']
data['title_text'] = data['title_text'].apply(preprocess_text)

In [10]:
data.head()

Unnamed: 0,published_date,published_platform,rating,type,text,title,helpful_votes,title_text
0,2024-03-12T14:41:14-04:00,0,0,review,We used this airline to go from Singapore to L...,Ok,0,ok we used this airline to go from singapore t...
1,2024-03-11T19:39:13-04:00,0,1,review,The service on Singapore Airlines Suites Class...,The service in Suites Class makes one feel lik...,0,the service in suites class makes one feel lik...
2,2024-03-11T12:20:23-04:00,0,0,review,"Booked, paid and received email confirmation f...",Don’t give them your money,0,dont give them your money booked paid and rece...
3,2024-03-11T07:12:27-04:00,0,1,review,"Best airline in the world, seats, food, servic...",Best Airline in the World,0,best airline in the world best airline in the ...
4,2024-03-10T05:34:18-04:00,0,0,review,Premium Economy Seating on Singapore Airlines ...,Premium Economy Seating on Singapore Airlines ...,0,premium economy seating on singapore airlines ...


In [11]:
X_train, X_test, y_train, y_test = train_test_split(data['title_text'], 
                                                    data['rating'], test_size=0.15, random_state=42)

In [12]:
X_train.shape

(8500,)

In [13]:
X_test.shape

(1500,)

In [14]:
bow = CountVectorizer()
X_train = bow.fit_transform(X_train).toarray()
X_test = bow.transform(X_test).toarray()

In [15]:
X_train.shape

(8500, 21914)

In [16]:
X_test.shape

(1500, 21914)

In [17]:
lr = LogisticRegression(random_state=42)
lr.fit(X_train, y_train)
preds = lr.predict(X_test)

In [18]:
f1_score(y_test, preds)

0.9370185772541911

## Medium

In [19]:
!pip install nltk



In [20]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV

In [21]:
nltk.download('stopwords', quiet=True)

True

In [22]:
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

In [23]:
def stemmer_deleter(text):
    text = re.sub(r'[^\w\s]', '', text).lower()
    stemmed_words = [stemmer.stem(word) for word in text.split()]
    word_tokens = word_tokenize(' '.join(stemmed_words))
    filtered_text = [word for word in word_tokens if word not in stop_words]

    return ' '.join(filtered_text)

In [24]:
data['title_text'] = data['title_text'].apply(stemmer_deleter)
data.head()

Unnamed: 0,published_date,published_platform,rating,type,text,title,helpful_votes,title_text
0,2024-03-12T14:41:14-04:00,0,0,review,We used this airline to go from Singapore to L...,Ok,0,ok use thi airlin go singapor london heathrow ...
1,2024-03-11T19:39:13-04:00,0,1,review,The service on Singapore Airlines Suites Class...,The service in Suites Class makes one feel lik...,0,servic suit class make one feel like vip servi...
2,2024-03-11T12:20:23-04:00,0,0,review,"Booked, paid and received email confirmation f...",Don’t give them your money,0,dont give money book paid receiv email confirm...
3,2024-03-11T07:12:27-04:00,0,1,review,"Best airline in the world, seats, food, servic...",Best Airline in the World,0,best airlin world best airlin world seat food ...
4,2024-03-10T05:34:18-04:00,0,0,review,Premium Economy Seating on Singapore Airlines ...,Premium Economy Seating on Singapore Airlines ...,0,premium economi seat singapor airlin worth mon...


In [25]:
X_train, X_test, y_train, y_test = train_test_split(data['title_text'], 
                                                    data['rating'], test_size=0.15, random_state=42)

In [26]:
vectorizers = {
    'CountVectorizer': CountVectorizer(),
    'TfidfVectorizer': TfidfVectorizer()
}

models = {
    'LogisticRegression': LogisticRegression(random_state=42),
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
    'CatBoostClassifier': CatBoostClassifier(metric_period=200, random_state=42)
}

In [27]:
results = []

for vec_name, vectorizer in vectorizers.items():
    for model_name, model in models.items():
        pipeline = Pipeline(
            [
                ('vectorizer', vectorizer),
                ('model', model),
            ]
        )
        pipeline.fit(X_train, y_train)
        preds = pipeline.predict(X_test)
        results.append(
            {
                'vectorizer': vec_name,
                'model': model_name,
                'f1': f1_score(preds, y_test),
            }
        )

Learning rate set to 0.025692
0:	learn: 0.6798783	total: 88.1ms	remaining: 1m 28s
200:	learn: 0.2804881	total: 5.31s	remaining: 21.1s
400:	learn: 0.2288865	total: 11.1s	remaining: 16.6s
600:	learn: 0.2033380	total: 16.9s	remaining: 11.2s
800:	learn: 0.1839222	total: 22.1s	remaining: 5.49s
999:	learn: 0.1696785	total: 26.9s	remaining: 0us
Learning rate set to 0.025692
0:	learn: 0.6787879	total: 69ms	remaining: 1m 8s
200:	learn: 0.2732573	total: 9.92s	remaining: 39.4s
400:	learn: 0.2232029	total: 19.7s	remaining: 29.5s
600:	learn: 0.1908230	total: 30.6s	remaining: 20.3s
800:	learn: 0.1697282	total: 41.2s	remaining: 10.2s
999:	learn: 0.1536477	total: 52.8s	remaining: 0us


In [28]:
best_result = max(results, key=lambda x: x['f1'])
best_result

{'vectorizer': 'TfidfVectorizer',
 'model': 'LogisticRegression',
 'f1': 0.9391771019677997}

In [29]:
params = {
    'vectorizer__max_features': [1000, 5000, 10000],
    'model__C': [0.1, 1, 10],
}

pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('model', LogisticRegression(random_state=42)),
])

grid_search = GridSearchCV(pipeline, params, cv=5, scoring='f1')
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                                       ('model',
                                        LogisticRegression(random_state=42))]),
             param_grid={'model__C': [0.1, 1, 10],
                         'vectorizer__max_features': [1000, 5000, 10000]},
             scoring='f1')

In [30]:
print('Best params: ', grid_search.best_params_)
print('Best f1 score: ', grid_search.best_score_)

Best params:  {'model__C': 1, 'vectorizer__max_features': 5000}
Best f1 score:  0.942441391585876


In [31]:
best_model = grid_search.best_estimator_
best_model

Pipeline(steps=[('vectorizer', TfidfVectorizer(max_features=5000)),
                ('model', LogisticRegression(C=1, random_state=42))])

In [32]:
# positive
pos = 'This flight was very comfortable, thank you to the company staff'
preds = best_model.predict([stemmer_deleter(pos)])
preds[0]

1

In [33]:
# negative
neg = 'It was the worst flight in my life, I do not recommend this company to anyone/'
preds = best_model.predict([stemmer_deleter(neg)])
preds[0]

0