<a href="https://colab.research.google.com/github/VartanyanAdik/service_by_reviews/blob/main/%D0%9C%D0%BE%D0%B4%D0%B5%D0%BB%D1%8C%20%D1%80%D0%B5%D0%B3%D1%80%D0%B5%D1%81%D0%B8%D0%B8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Импортируем библиотеки

In [None]:
# Загрузка необходимых библиотек и модулей
import zipfile # Библиотека для работы с zip архивами
import os      # Библиотека для работы с фаловой системой
import time    # Библиотека для работы со временем
from google.colab import drive # Модуль для работы с Google Disk
from PIL import *  # Модуль для работы с файлами
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, mean_squared_error
import re
import nltk
import joblib
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_absolute_error

# Загрузка данных и их чтение

In [None]:
# Mount Google Drive at /content/drive so files stored on the drive can be read by path.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Path to the dataset archive on the mounted Google Drive
zip_file = '/content/drive/My Drive/Проект от гринатом/aclImdb.zip'

# Extract the archive into the current working directory; the context manager
# closes the archive handle even if extraction fails part-way.
with zipfile.ZipFile(zip_file, 'r') as z:
    z.extractall()

# Show the working directory contents to confirm the extraction result
print(os.listdir())

['.config', 'aclImdb', 'drive', 'sample_data']


In [None]:
# Download the NLTK resources requested by this notebook
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

# Dataset locations. The archive is extracted into the current working
# directory, so the paths are resolved relative to it rather than through
# '../content', which only works when the working directory is named 'content'.
train_path = os.path.join('aclImdb', 'train')
test_path = os.path.join('aclImdb', 'test')

# Helper that loads every review file of one sentiment folder
def read_data(folder, label, base_path):
    """Read all ``.txt`` reviews from ``base_path/folder``.

    Args:
        folder: Name of the sub-directory to read (e.g. ``'pos'`` or ``'neg'``).
        label: Value stored under the ``'label'`` key of every returned record.
        base_path: Directory that contains ``folder``.

    Returns:
        A list of dicts with the keys ``'review'`` (file content), ``'label'``
        (the ``label`` argument) and ``'rating'`` (the integer between the
        first underscore and the extension of the file name).

    Raises:
        FileNotFoundError: If ``base_path/folder`` does not exist.
        IndexError, ValueError: If a ``.txt`` file name has no ``_<int>`` part.
    """
    data = []
    folder_path = os.path.join(base_path, folder)

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and everything derived from it) reproducible between runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue

        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            text = file.read()

        # The rating is the part of the name after the first underscore
        target = filename.split('_')[1].split('.')[0]

        data.append({
            'review': text,
            'label': label,
            'rating': int(target)
        })

    return data


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Load the positive (label 1) and negative (label 0) reviews of the train split
pos_data, neg_data = read_data('pos', 1, train_path), read_data('neg', 0, train_path)

# Same for the test split
pos_data_test, neg_data_test = read_data('pos', 1, test_path), read_data('neg', 0, test_path)

# Concatenate each split with the positive records first, negatives after
all_data = [*pos_data, *neg_data]
test_data = [*pos_data_test, *neg_data_test]

# Создание и проверка датафреймов

In [None]:
# Turn the record lists of both splits into DataFrames
df_train, df_test = pd.DataFrame(all_data), pd.DataFrame(test_data)

In [None]:
# Display the raw training frame (review text, label, rating)
df_train

Unnamed: 0,review,label,rating
0,Tressa's vocal performance was Outstanding!! T...,1,10
1,"Well, when before I saw this film I really was...",1,10
2,"I remember this movie from when i was 12, it w...",1,10
3,"This is one of the best reunion specials ever,...",1,10
4,This made for television version of the legend...,1,7
...,...,...,...
24995,"The only reason ""The Norliss Tapes"" deserves A...",0,1
24996,I haven't seen it in over twenty years. OJ was...,0,1
24997,"...was so that I could, in good conscience, te...",0,3
24998,This movie kinda let me down. It seemed a lot ...,0,4


In [None]:
# Display the raw test frame (review text, label, rating)
df_test

Unnamed: 0,review,label,rating
0,"I'll admit that I've never seen ""Waiting for G...",1,8
1,Classe Tous Risques (The Big Risk) is repeated...,1,9
2,This movie was the second movie I saw on the c...,1,10
3,"Let's face it, there is no perfect production ...",1,9
4,I recently visited the Magic Kingdom as an adu...,1,10
...,...,...,...
24995,Jeff Speakman never really made it beyond the ...,0,4
24996,What a terrible movie! It represents perfectly...,0,1
24997,I only wish there was a grade lower than F to ...,0,1
24998,"This movie is a real low budget production, ye...",0,3


In [None]:
# (rows, columns) of the training frame
df_train.shape

(25000, 3)

In [None]:
# (rows, columns) of the test frame
df_test.shape

(25000, 3)

# Очистка и преобразование данных

In [None]:
# English stop words used to filter tokens
stop_words = set(stopwords.words('english'))

# Lemmatizer applied to every remaining token
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize one review into a space-separated string of lemmatized tokens.

    Steps: lowercase, replace HTML tags with a space, drop underscores and
    punctuation, split on whitespace, remove stop words, lemmatize, re-join.

    Args:
        text: Raw review text.

    Returns:
        The cleaned text; tokens are separated by single spaces.
    """
    # 1. Lowercase
    text = text.lower()

    # 2. Replace HTML tags with a space so that words on both sides of a tag
    #    (e.g. "end.<br /><br />Next") are not glued into a single token
    text = re.sub(r'<[^>]+>', ' ', text)

    # 3. Remove punctuation. Underscores are handled separately because they
    #    belong to \w and therefore survive the second pattern.
    text = re.sub(r'_+', '', text)
    text = re.sub(r'[^\w\s]', '', text)

    # 4. Tokenize on any whitespace run; unlike split(' ') this yields no
    #    empty tokens and also splits on newlines and tabs
    tokens = text.split()

    # 5-6. Drop stop words, then lemmatize what is left
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # 7. Join the tokens back into one string
    return ' '.join(tokens)


In [None]:
# Replace the raw review text of both splits with its preprocessed form
df_train['review'] = df_train['review'].apply(preprocess_text)
df_test['review'] = df_test['review'].apply(preprocess_text)

In [None]:
# Display the training frame after the review column was preprocessed
df_train

Unnamed: 0,review,label,rating
0,tressas vocal performance outstanding tressa p...,1,10
1,well saw film really wasnt sure whether would ...,1,10
2,remember movie 12 amazing remember day like th...,1,10
3,one best reunion special ever adam west burt w...,1,10
4,made television version legendary stand hopele...,1,7
...,...,...,...
24995,reason norliss tape deserves star presence ang...,0,1
24996,havent seen twenty year oj bus driver arte joh...,0,1
24997,could good conscience tell everyone horrible m...,0,3
24998,movie kinda let seemed lot like movie jaw hopp...,0,4


In [None]:
# Display the test frame after the review column was preprocessed
df_test

Unnamed: 0,review,label,rating
0,ill admit ive never seen waiting guffman 1997s...,1,8
1,classe tous risques big risk repeatedly recomm...,1,9
2,movie second movie saw cinema child scared liv...,1,10
3,let face perfect production hamlet simply far ...,1,9
4,recently visited magic kingdom adult mom best ...,1,10
...,...,...,...
24995,jeff speakman never really made beyond lowest ...,0,4
24996,terrible movie represents perfectly state dege...,0,1
24997,wish grade lower f give scored 1 vote tallyi g...,0,1
24998,movie real low budget production yet say anyth...,0,3


In [None]:
# Encode textual sentiment labels as integers. read_data already stores the
# labels as 1/0, and Series.map with a dict turns every value missing from the
# dict into NaN, so values that are not in the mapping are passed through
# unchanged instead of being wiped out.
label_codes = {'positive': 1, 'negative': 0}
df_train['label'] = df_train['label'].map(lambda value: label_codes.get(value, value)).astype(int)
df_test['label'] = df_test['label'].map(lambda value: label_codes.get(value, value)).astype(int)

In [None]:
# Learn the TF-IDF vocabulary on the training reviews and vectorize them
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(df_train.review)

# First ten vocabulary terms, then the shape of the training matrix
print(vectorizer.get_feature_names_out()[:10], X_train.shape, sep='\n')

['00' '000' '0000000000001' '000001' '0001' '00015' '001' '0010' '002'
 '00383042']
(25000, 133293)


In [None]:
# Vectorize the test reviews with the vocabulary fitted on the training split
X_test = vectorizer.transform(df_test['review'])
print(X_test.shape)

(25000, 133293)


# Выбор лучшей модели

In [None]:
def test_error(model, model_name, X_test, y_test, X_train, y_train, metric):
    model.fit(X_train, y_train)
    print(f'{model_name}: метрика на train данных: {metric(model.predict(X_train), y_train):.4f}')
    test_pred = model.predict(X_test)
    f = lambda x: round(x)
    rounded_pred = list(map(f, test_pred))
    print(f'{model_name}: метрика на тестовых данных: {metric(test_pred, y_test):.4f}')
    print()

In [None]:
# Словарь моделей регрессии
models_regression = {
    'Linear Regression': LinearRegression(),

    'Lasso Regression': Lasso()
}

for a in np.linspace(0.5, 5.5, 21):
    models_regression[f'Ridge alpha={a}'] = Ridge(a)


In [None]:
# Display the dictionary of candidate models
models_regression

{'Linear Regression': LinearRegression(),
 'Lasso Regression': Lasso(),
 'Ridge alpha=0.5': Ridge(alpha=0.5),
 'Ridge alpha=0.75': Ridge(alpha=0.75),
 'Ridge alpha=1.0': Ridge(),
 'Ridge alpha=1.25': Ridge(alpha=1.25),
 'Ridge alpha=1.5': Ridge(alpha=1.5),
 'Ridge alpha=1.75': Ridge(alpha=1.75),
 'Ridge alpha=2.0': Ridge(alpha=2.0),
 'Ridge alpha=2.25': Ridge(alpha=2.25),
 'Ridge alpha=2.5': Ridge(alpha=2.5),
 'Ridge alpha=2.75': Ridge(alpha=2.75),
 'Ridge alpha=3.0': Ridge(alpha=3.0),
 'Ridge alpha=3.25': Ridge(alpha=3.25),
 'Ridge alpha=3.5': Ridge(alpha=3.5),
 'Ridge alpha=3.75': Ridge(alpha=3.75),
 'Ridge alpha=4.0': Ridge(alpha=4.0),
 'Ridge alpha=4.25': Ridge(alpha=4.25),
 'Ridge alpha=4.5': Ridge(alpha=4.5),
 'Ridge alpha=4.75': Ridge(alpha=4.75),
 'Ridge alpha=5.0': Ridge(alpha=5.0),
 'Ridge alpha=5.25': Ridge(alpha=5.25),
 'Ridge alpha=5.5': Ridge(alpha=5.5)}

In [None]:
# Regression target: the numeric rating of each review
y_train = df_train['rating'].to_numpy()
y_test = df_test['rating'].to_numpy()

# Both frames were built as positives followed by negatives, so the number of
# positive records is the row index where the negative reviews start
n_pos_train = len(pos_data)
n_pos_test = len(pos_data_test)

# Training features / targets per sentiment
X_train_pos, X_train_neg = X_train[:n_pos_train], X_train[n_pos_train:]
y_train_pos, y_train_neg = y_train[:n_pos_train], y_train[n_pos_train:]

# Test features / targets per sentiment
X_test_pos, X_test_neg = X_test[:n_pos_test], X_test[n_pos_test:]
y_test_pos, y_test_neg = y_test[:n_pos_test], y_test[n_pos_test:]


In [None]:
# Header plus one blank line, then the MAE report of every candidate model
# fitted and scored on the negative reviews only
print('MAE на негативных отзывах:', end='\n\n')
for model_name, model in models_regression.items():
    test_error(
        model=model,
        model_name=model_name,
        X_test=X_test_neg,
        y_test=y_test_neg,
        X_train=X_train_neg,
        y_train=y_train_neg,
        metric=mean_absolute_error,
    )

MAE на негативных отзывах:

Linear Regression: метрика на train данных: 0.0001
Linear Regression: метрика на тестовых данных: 0.9687

Lasso Regression: метрика на train данных: 1.0723
Lasso Regression: метрика на тестовых данных: 1.0639

Ridge alpha=0.5: метрика на train данных: 0.4080
Ridge alpha=0.5: метрика на тестовых данных: 0.8504

Ridge alpha=0.75: метрика на train данных: 0.4746
Ridge alpha=0.75: метрика на тестовых данных: 0.8422

Ridge alpha=1.0: метрика на train данных: 0.5207
Ridge alpha=1.0: метрика на тестовых данных: 0.8383

Ridge alpha=1.25: метрика на train данных: 0.5552
Ridge alpha=1.25: метрика на тестовых данных: 0.8365

Ridge alpha=1.5: метрика на train данных: 0.5826
Ridge alpha=1.5: метрика на тестовых данных: 0.8358

Ridge alpha=1.75: метрика на train данных: 0.6051
Ridge alpha=1.75: метрика на тестовых данных: 0.8358

Ridge alpha=2.0: метрика на train данных: 0.6240
Ridge alpha=2.0: метрика на тестовых данных: 0.8362

Ridge alpha=2.25: метрика на train данных:

In [None]:
# Header plus one blank line, then the MAE report of every candidate model
# fitted and scored on the positive reviews only
print('MAE на позитивных отзывах:', end='\n\n')
for model_name, model in models_regression.items():
    test_error(
        model=model,
        model_name=model_name,
        X_test=X_test_pos,
        y_test=y_test_pos,
        X_train=X_train_pos,
        y_train=y_train_pos,
        metric=mean_absolute_error,
    )

MAE на позитивных отзывах:

Linear Regression: метрика на train данных: 0.0000
Linear Regression: метрика на тестовых данных: 1.0271

Lasso Regression: метрика на train данных: 1.0498
Lasso Regression: метрика на тестовых данных: 1.0428

Ridge alpha=0.5: метрика на train данных: 0.4234
Ridge alpha=0.5: метрика на тестовых данных: 0.8796

Ridge alpha=0.75: метрика на train данных: 0.4902
Ridge alpha=0.75: метрика на тестовых данных: 0.8691

Ridge alpha=1.0: метрика на train данных: 0.5362
Ridge alpha=1.0: метрика на тестовых данных: 0.8640

Ridge alpha=1.25: метрика на train данных: 0.5707
Ridge alpha=1.25: метрика на тестовых данных: 0.8615

Ridge alpha=1.5: метрика на train данных: 0.5980
Ridge alpha=1.5: метрика на тестовых данных: 0.8603

Ridge alpha=1.75: метрика на train данных: 0.6203
Ridge alpha=1.75: метрика на тестовых данных: 0.8599

Ridge alpha=2.0: метрика на train данных: 0.6390
Ridge alpha=2.0: метрика на тестовых данных: 0.8599

Ridge alpha=2.25: метрика на train данных:

In [None]:
# Header plus one blank line, then the MSE report of every candidate model
# fitted and scored on the negative reviews only
print('MSE на негативных отзывах:', end='\n\n')
for model_name, model in models_regression.items():
    test_error(
        model=model,
        model_name=model_name,
        X_test=X_test_neg,
        y_test=y_test_neg,
        X_train=X_train_neg,
        y_train=y_train_neg,
        metric=mean_squared_error,
    )

MSE на негативных отзывах:

Linear Regression: метрика на train данных: 0.0000
Linear Regression: метрика на тестовых данных: 1.4575

Lasso Regression: метрика на train данных: 1.4172
Lasso Regression: метрика на тестовых данных: 1.3985

Ridge alpha=0.5: метрика на train данных: 0.2470
Ridge alpha=0.5: метрика на тестовых данных: 1.0641

Ridge alpha=0.75: метрика на train данных: 0.3290
Ridge alpha=0.75: метрика на тестовых данных: 1.0329

Ridge alpha=1.0: метрика на train данных: 0.3918
Ridge alpha=1.0: метрика на тестовых данных: 1.0154

Ridge alpha=1.25: метрика на train данных: 0.4421
Ridge alpha=1.25: метрика на тестовых данных: 1.0045

Ridge alpha=1.5: метрика на train данных: 0.4837
Ridge alpha=1.5: метрика на тестовых данных: 0.9975

Ridge alpha=1.75: метрика на train данных: 0.5189
Ridge alpha=1.75: метрика на тестовых данных: 0.9929

Ridge alpha=2.0: метрика на train данных: 0.5492
Ridge alpha=2.0: метрика на тестовых данных: 0.9899

Ridge alpha=2.25: метрика на train данных:

In [None]:
# Header plus one blank line, then the MSE report of every candidate model
# fitted and scored on the positive reviews only
print('MSE на позитивных отзывах:', end='\n\n')
for model_name, model in models_regression.items():
    test_error(
        model=model,
        model_name=model_name,
        X_test=X_test_pos,
        y_test=y_test_pos,
        X_train=X_train_pos,
        y_train=y_train_pos,
        metric=mean_squared_error,
    )

MSE на позитивных отзывах:

Linear Regression: метрика на train данных: 0.0000
Linear Regression: метрика на тестовых данных: 1.6114

Lasso Regression: метрика на train данных: 1.3496
Lasso Regression: метрика на тестовых данных: 1.3314

Ridge alpha=0.5: метрика на train данных: 0.2645
Ridge alpha=0.5: метрика на тестовых данных: 1.1213

Ridge alpha=0.75: метрика на train данных: 0.3484
Ridge alpha=0.75: метрика на тестовых данных: 1.0827

Ridge alpha=1.0: метрика на train данных: 0.4121
Ridge alpha=1.0: метрика на тестовых данных: 1.0609

Ridge alpha=1.25: метрика на train данных: 0.4626
Ridge alpha=1.25: метрика на тестовых данных: 1.0472

Ridge alpha=1.5: метрика на train данных: 0.5042
Ridge alpha=1.5: метрика на тестовых данных: 1.0382

Ridge alpha=1.75: метрика на train данных: 0.5392
Ridge alpha=1.75: метрика на тестовых данных: 1.0320

Ridge alpha=2.0: метрика на train данных: 0.5693
Ridge alpha=2.0: метрика на тестовых данных: 1.0277

Ridge alpha=2.25: метрика на train данных:

# Моделирование

In [None]:

# Training rows: the first 12500 are used as positive reviews, the rest as negative
train_pos_pipeline_review = df_train.iloc[:12500]
train_pos_pipeline_rating = df_train.iloc[:12500]
train_neg_pipeline_review = df_train.iloc[12500:]
train_neg_pipeline_rating = df_train.iloc[12500:]

# Test rows, split at the same position
test_pos_pipeline_review = df_test.iloc[:12500]
test_pos_pipeline_rating = df_test.iloc[:12500]
test_neg_pipeline_review = df_test.iloc[12500:]
test_neg_pipeline_rating = df_test.iloc[12500:]

In [None]:
# Pipeline inputs are the review texts, targets are the ratings (training split)
X_train_negative, y_train_negative = train_neg_pipeline_review.review, train_neg_pipeline_rating.rating
X_train_positive, y_train_positive = train_pos_pipeline_review.review, train_pos_pipeline_rating.rating

# Same columns for the test split
X_test_negative, y_test_negative = test_neg_pipeline_review.review, test_neg_pipeline_rating.rating
X_test_positive, y_test_positive = test_pos_pipeline_review.review, test_pos_pipeline_rating.rating

## Предсказание негативных оценок

In [None]:
# Pipeline for negative reviews: TF-IDF vectorization followed by Ridge
# regression. Step names match the ones make_pipeline derives from the classes.
pipeline_ridge_neg = Pipeline(steps=[
    ('tfidfvectorizer', TfidfVectorizer()),
    ('ridge', Ridge(alpha=1.75)),
])

In [None]:
# Fit the negative-review pipeline on the negative training texts and their ratings
pipeline_ridge_neg.fit(X_train_negative, y_train_negative)

In [None]:
# Predict ratings for the negative test reviews and report the MSE
y_pred_neg = pipeline_ridge_neg.predict(X_test_negative)
print(mean_squared_error(y_true=y_test_negative, y_pred=y_pred_neg))

0.997197638625999


In [None]:
# MAE of the negative-review pipeline on the negative test reviews
print(mean_absolute_error(y_test_negative, y_pred_neg))

0.8372751686646785


In [None]:
# Serialize the negative-review pipeline to 'pipeline_ridge_neg.pkl' (relative path)
joblib.dump(pipeline_ridge_neg, 'pipeline_ridge_neg.pkl')

['pipeline_ridge_neg.pkl']

## Предсказание положительных оценок

In [None]:
# Pipeline for positive reviews: TF-IDF vectorization followed by Ridge
# regression. Step names match the ones make_pipeline derives from the classes.
pipeline_ridge_pos = Pipeline(steps=[
    ('tfidfvectorizer', TfidfVectorizer()),
    ('ridge', Ridge(alpha=1.75)),
])

In [None]:
# Fit the positive-review pipeline on the positive training texts and their ratings
pipeline_ridge_pos.fit(X_train_positive, y_train_positive)

In [None]:
# Predict ratings for the positive test reviews and report the MSE
y_pred_pos = pipeline_ridge_pos.predict(X_test_positive)
print(mean_squared_error(y_true=y_test_positive, y_pred=y_pred_pos))

1.0334030682471673


In [None]:
# MAE of the positive-review pipeline on the positive test reviews
print(mean_absolute_error(y_test_positive, y_pred_pos))

0.8597678306775807


In [None]:
# Serialize the positive-review pipeline to 'pipeline_ridge_pos.pkl' (relative path)
joblib.dump(pipeline_ridge_pos, 'pipeline_ridge_pos.pkl')

['pipeline_ridge_pos.pkl']