# <div align="center">AUA-ST Task 2 Sergo Poghosyan</div>

### Importing necessary libraries 

In [1]:
import re
import time

import pandas as pd
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

### Loading necessary files and converting them into data frames

In [2]:
# CSV inputs, looked up relative to the current working directory.
CSV_FILES = {
    'train': 'train.csv',
    'validation': 'validation.csv',
    'test': 'test.csv',
    'clusters': 'dialogsum_clustered.csv',
}

# One data frame per file; the three splits first, then the cluster table.
train_df = pd.read_csv(CSV_FILES['train'])
validation_df = pd.read_csv(CSV_FILES['validation'])
test_df = pd.read_csv(CSV_FILES['test'])
clusters_df = pd.read_csv(CSV_FILES['clusters'])

### Making sure we don't have any missing values

In [3]:
# Per-column count of missing values in the test split.
# To inspect another frame, call .isna().sum() on train_df, validation_df
# or clusters_df instead.
test_df.isna().sum()

id          0
dialogue    0
summary     0
topic       0
dtype: int64

### Merging train, test and validation with clusters

In [4]:
# Left-join every split with the cluster table on the shared 'id' column,
# so each row of a split is kept and gains the columns of clusters_df.
cluster_join = dict(on='id', how='left')

train_df = pd.merge(train_df, clusters_df, **cluster_join)
validation_df = pd.merge(validation_df, clusters_df, **cluster_join)
test_df = pd.merge(test_df, clusters_df, **cluster_join)

### Keeping only the test rows whose `id` ends with `_1`

In [5]:
# Boolean mask: True for rows whose id string ends with '_1'.
# NOTE(review): presumably these are the first of several entries per
# dialogue in the test file; confirm against the dataset description.
id_ends_with_1 = test_df['id'].str.endswith('_1')
test_df = test_df.loc[id_ends_with_1]

### Feature extraction using TF-IDF with unigrams and bigrams

In [6]:
def combined_text(frame):
    """Return each row's dialogue and summary joined by a single space."""
    return frame['dialogue'] + " " + frame['summary']


# Unigram + bigram TF-IDF limited to 5000 features. The vectorizer is fitted
# on the training text only; validation and test reuse that fitted state.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train = vectorizer.fit_transform(combined_text(train_df))
X_validation = vectorizer.transform(combined_text(validation_df))
X_test = vectorizer.transform(combined_text(test_df))

# Targets: the 'cluster' column of each split.
y_train = train_df['cluster']
y_validation = validation_df['cluster']
y_test = test_df['cluster']

### SGD classifier with adjusted regularization and cross-validation

In [7]:
# scikit-learn 1.1 renamed SGDClassifier's logistic loss from 'log' to
# 'log_loss', and 1.3 removed the old spelling. Pick the name the installed
# release accepts so this cell runs on both older and current versions.
sklearn_major_minor = tuple(
    int(part) for part in re.match(r'(\d+)\.(\d+)', sklearn.__version__).groups()
)
LOGISTIC_LOSS = 'log_loss' if sklearn_major_minor >= (1, 1) else 'log'

model = SGDClassifier(loss=LOGISTIC_LOSS, class_weight='balanced', random_state=42, max_iter=1000, tol=1e-4, alpha=1e-5)


def evaluate_split(estimator, split_name, X, y_true):
    """Predict on one split, print timing and weighted F1, and return both results.

    Parameters
    ----------
    estimator : fitted classifier exposing ``predict``.
    split_name : capitalised split label used in the printed messages
        (for example 'Validation'); its lower-case form is used in the
        timing line.
    X : feature matrix passed to ``estimator.predict``.
    y_true : true labels for the rows of ``X``.

    Returns
    -------
    tuple
        ``(predictions, weighted_f1)``.
    """
    started = time.perf_counter()
    predictions = estimator.predict(X)
    weighted_f1 = f1_score(y_true, predictions, average='weighted')
    print(f'Evaluation on {split_name.lower()} set time: {time.perf_counter() - started:.2f} seconds')
    print(f'Weighted Average F1 Score on {split_name} Set for SGD Classifier: {weighted_f1:.4f}')
    return predictions, weighted_f1


# 5-fold cross-validation on the training matrix, scored with weighted F1.
# NOTE: X_train comes from a vectorizer fitted on the full training text, so
# every fold shares the same vocabulary and IDF weights.
# time.perf_counter() is used for all timings because it is monotonic and
# meant for measuring short durations.
start_time = time.perf_counter()
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_weighted')
print(f'Cross-validation time: {time.perf_counter() - start_time:.2f} seconds')
print(f'Cross-validated F1 Scores: {cv_scores}')
print(f'Mean Cross-validated F1 Score: {cv_scores.mean()}\n')

# Final fit on the whole training set.
start_time = time.perf_counter()
model.fit(X_train, y_train)
print(f'Model training time: {time.perf_counter() - start_time:.2f} seconds\n')

# Held-out evaluation; the blank print keeps the original spacing between
# the validation and test reports.
y_val_pred, val_weighted_f1 = evaluate_split(model, 'Validation', X_validation, y_validation)
print()
y_test_pred, test_weighted_f1 = evaluate_split(model, 'Test', X_test, y_test)

Cross-validation time: 15.30 seconds
Cross-validated F1 Scores: [0.36348475 0.35399354 0.37385002 0.36417958 0.37136691]
Mean Cross-validated F1 Score: 0.3653749597326086

Model training time: 4.45 seconds

Evaluation on validation set time: 0.00 seconds
Weighted Average F1 Score on Validation Set for SGD Classifier: 0.3726

Evaluation on test set time: 0.00 seconds
Weighted Average F1 Score on Test Set for SGD Classifier: 0.3956
