In [None]:
import csv
import re
import zipfile
from io import BytesIO
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_predict
import pandas as pd

In [None]:
# Read training data: each non-empty line is expected to be "<domain>,<label>".
# Labels are kept as strings, exactly as they appear in the file.
train_domains = []
y_train = []
with open("train.txt", 'r') as f:
    for line in f:
        record = line.strip()
        if not record:
            # Tolerate blank lines (e.g. a trailing empty line at end of file)
            continue
        fields = record.split(',')
        train_domains.append(fields[0].strip())
        # Strip whitespace instead of chopping the last character, so a final
        # line without a trailing newline keeps its full label
        y_train.append(fields[1].strip())

In [None]:
# Read test data: the domain name is the first comma-separated field of each line
test_domains = []
with open("test.txt", 'r') as f:
    for line in f:
        record = line.strip()
        if not record:
            # A blank line would otherwise become an empty "domain" row in the submission
            continue
        # Strip so that a line without a comma does not keep its trailing newline
        test_domains.append(record.split(',')[0].strip())

In [None]:
# Read textual content of webpages of domain names.
# domains.zip contains one inner "<name>.zip" per domain; every file inside an
# inner archive is decoded as UTF-16 and the pieces are concatenated, each
# followed by a single space.
text = {}
with zipfile.ZipFile('domains.zip', "r") as outer_zip:
    for member in outer_zip.namelist():
        if not member.endswith('.zip'):
            continue
        domain_key = member[:-4]  # drop the ".zip" suffix
        with zipfile.ZipFile(BytesIO(outer_zip.read(member))) as inner_zip:
            # Join once instead of growing the string with += for every page
            text[domain_key] = ''.join(
                inner_zip.read(page_name).decode('utf16') + ' '
                for page_name in inner_zip.namelist()
            )

In [None]:
# Retrieve textual content of domain names of the training set;
# domains with no entry in `text` get an empty string
train_data = [text.get(domain, '') for domain in train_domains]


In [None]:
# Retrieve textual content of domain names of the test set;
# domains with no entry in `text` get an empty string
test_data = [text.get(domain, '') for domain in test_domains]

In [None]:
# Drop the reference to the domain -> page-text dictionary; the strings still
# needed are already held by train_data and test_data
text = None

In [None]:
# Inspect the raw text of the first training domain
train_data[0]

'http://www.autocarnet.gr/2012/12/2_23.html *#*Συμπληρώστε το ερωτηματολόγιο για την αξιοπιστία του αυτοκινήτου σας\nΕιδήσεις\nΠαραδόθηκε στην κυκλοφορία το τμήμα Παραδείσια Τσακώνα του αυτοκινητοδρόμου Κορίνθος Καλαμάτα\nΤέλος μπαίνει στην ταλαιπωρία των οδηγών μετακινούνταν από και προς την Καλαμάτα μέσω Παραδεισίων μετά την παράδοση στην κυκλοφορία του τμήματος της εθνικής οδού Τρίπολης Καλαμάτας στην περιοχή της Τσακώνας το οποίο παρέμεινε κλειστό εδώ και χρόνια λόγω έργων\nΘα υπάρχει μόνο μία μικρή\nπαράκαμψη στο σημείο κατασκευής της μεγάλης γέφυρας όπου βυθιζόταν ο δρόμος Ο χρόνος της διαδρομής Αθήνα Καλαμάτα χλμ θα μειωθεί περίπου κατά τριάντα λεπτά στις ώρες από ώρες και λεπτά που είναι σήμερα\nΜε την παράδοση του τμήματος Παραδείσια Τσακώνα αποδίδονται σε πλήρη κυκλοφορία και οι Ανισόπεδοι Κόμβοι Παραδεισίων και Κυπαρισσίας καθώς και το τμήμα από την Τσακώνα μέχρι τον Ανισόπεδο Κόμβο Κυπαρισσίας που κατασκευάστηκαν στο πλαίσιο της Σύμβασης Παραχώρησης του αυτοκινητοδρόμου Κόρ

In [None]:
# Inspect the raw text of the first test domain
test_data[0]

'http://startupper.gr/500-%ce%b5%ce%ba%ce%b1%cf%84-%ce%b5%cf%85%cf%81%cf%89-%ce%b3%ce%b9%ce%b1-startups-%ce%b1%cf%80%ce%bf-%ce%b5%ce%b5/ *#*Είστε στο εκατ ευρώ από την ΕΕ για χρηματοδότηση και ΜΜΕ\n εκατ ευρώ από την ΕΕ για χρηματοδότηση και ΜΜΕ\n \nΧρηματοδότηση ύψους εκατ ευρώ μέσω διαφόρων προγραμμάτων ενέκρινε η Ευρωπαϊκή Επιτροπή για και μικρομεσαίες επιχειρήσεις για την Ελλάδα για το τρέχον έτος Τα χρήματα αυτά θα διοχετευθούν στην αγορά μέσω τραπεζών και άλλων χρηματοπιστωτικών και επενδυτικών εταιρειών \nΤο σχέδιο ονομάζεται και πρόκειται για ένα ευέλικτο σχήμα εργαλείων χρηματοδότησης τα οποία περιλαμβάνουν δάνεια μικρο χρηματοδοτήσεις εγγυήσεις και επιχειρηματικά κεφάλαια Κάθε χρόνο πάνω από επιχειρήσεις επωφελούνται από αυτή τη χρηματοδότηση στην Ε Ε \nΓια την Ελλάδα έχουν προβλεφθεί εκατ ευρώ από το Ευρωπαϊκό Ταμείο Επενδύσεων μέσω διαφόρων προγραμμάτων όπως προαναφέραμε για τις επιχειρήσεις και άλλα εκατ ευρώ από την Ευρωπαϊκή Τράπεζα Επενδύσεων τα οποία προορίζονται κυρίω

## Text pre-processing

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from tqdm import tqdm

# Fetch the NLTK resources used by the pre-processing step
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Shared by preprocess_text: Greek stop-word set and a WordNet lemmatizer
stop_words = set(stopwords.words('greek'))
lemmatizer = WordNetLemmatizer()

# Deletes ASCII punctuation and digits; built once rather than on every call
_PUNCT_DIGIT_TABLE = str.maketrans('', '', string.punctuation + string.digits)


def preprocess_text(text):
    """Normalize one document for vectorization.

    Steps: lowercase, delete ASCII punctuation and digits, tokenize with
    ``nltk.word_tokenize``, drop tokens found in ``stop_words``, lemmatize the
    remaining tokens with ``lemmatizer``, and re-join them with single spaces.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The cleaned document as a single space-separated ``str``.
    """
    # NOTE(review): WordNet is an English lexical resource, so lemmatization
    # presumably leaves Greek tokens unchanged - confirm whether it is needed.
    cleaned = text.lower().translate(_PUNCT_DIGIT_TABLE)
    tokens = nltk.word_tokenize(cleaned)
    kept = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(kept)




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Drogias\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Drogias\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Drogias\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Clean every training and test document with preprocess_text
train_data_preprocessed = list(map(preprocess_text, train_data))
test_data_preprocessed = list(map(preprocess_text, test_data))

In [None]:
# TF-IDF over word 1- to 3-grams; the vocabulary is learned from the training texts only
vec = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=50000,
    decode_error='ignore',
    strip_accents='unicode',
    min_df=1,
    max_df=0.95,
)
X_train = vec.fit_transform(train_data_preprocessed)

# Map the test texts onto the vocabulary fitted above
X_test = vec.transform(test_data_preprocessed)

for split_name, matrix in (("Train", X_train), ("Test", X_test)):
    print(split_name + " matrix dimensionality: ", matrix.shape)

Train matrix dimensionality:  (1812, 50000)
Test matrix dimensionality:  (605, 50000)


## Reducing dimensionality using SVD (5000 components requested; the output has 1812, equal to the number of training samples)

In [None]:
from sklearn.decomposition import TruncatedSVD


# Fit the SVD projection on the training TF-IDF matrix only, then apply the
# same projection to the test matrix.
# NOTE(review): 5000 components are requested, but the recorded output of this
# cell shows 1812 columns - presumably capped by the 1812 training rows;
# confirm and consider requesting a smaller value explicitly.
svd = TruncatedSVD(n_components=5000, random_state=4321)

X_train_svd = svd.fit_transform(X_train)
X_test_svd = svd.transform(X_test)

print("Shape (training data) after SVD: ", X_train_svd.shape)
print("Shape (test data) after SVD: ", X_test_svd.shape)

Shape (training data) after SVD:  (1812, 1812)
Shape (test data) after SVD:  (605, 1812)


In [None]:
!pip install imbalanced-learn

In [None]:
# Count how many training domains carry each label (labels are strings read from train.txt)
print('--Class distribution before oversampling--')
print(pd.Series(y_train).value_counts())

--Class distribution before oversampling--
3    548
5    280
2    263
8    189
1    140
4    113
7     99
6     98
0     82
Name: count, dtype: int64


In [None]:
# Disabled experiment: the code below is wrapped in a string literal, so none of it runs.
# It would oversample the training set with SMOTE and print the new class counts.
'''
from imblearn.over_sampling import SMOTE

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to generate synthetic samples
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Check the class distribution after resampling
import pandas as pd
print("--Class distribution after oversampling--")
print(pd.Series(y_train_resampled).value_counts())
'''

In [None]:
# Disabled experiment (string literal, not executed): an 80/20 train/validation split.
# It reads X_train_resampled / y_train_resampled, which are only assigned inside
# the disabled SMOTE code.
'''
from sklearn.model_selection import train_test_split

# Split the resampled training data into training and validation sets
X_train_final, X_val, y_train_final, y_val = train_test_split(X_train_resampled, y_train_resampled, test_size=0.2, random_state=42)
'''

In [None]:
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [None]:
# Disabled experiment (string literal, not executed): logistic regression on the
# SMOTE-resampled data with classification reports.
# NOTE(review): as written it fits on all of X_train_resampled, which X_val was
# split from, so the validation report would not be a held-out estimate.
'''

clf = LogisticRegression(max_iter = 1000)

clf.fit(X_train_resampled, y_train_resampled)
y_pred = clf.predict_proba(X_test)
y_val_pred = clf.predict(X_val)
y_train_pred = clf.predict(X_train_resampled)

print("Classification Report:")
print(classification_report(y_train_resampled, y_train_pred))
print(classification_report(y_val, y_val_pred))
'''

In [None]:
# Disabled experiment (string literal, not executed): class-weighted logistic
# regression on the SVD features. The only assignment of y_preds_proba in this
# notebook is the commented-out cross_val_predict line inside this string.
'''
from sklearn.utils.class_weight import compute_class_weight

class_labels = np.unique(y_train)
class_weights = compute_class_weight(class_weight ='balanced', classes= class_labels, y = y_train)


class_weight_dict = dict(zip(class_labels, class_weights))


clf = LogisticRegression(class_weight=class_weight_dict, max_iter=1000)



# Perform 5-fold cross-validation with predict_proba
#y_preds_proba = cross_val_predict(clf, X_train, y_train, cv=5, method='predict_proba')

clf.fit(X_train_svd, y_train)
y_pred = clf.predict_proba(X_test_svd)
y_pred_train = clf.predict_proba(X_train_svd)

'''

# Baseline: logistic regression (no class weighting) trained on the SVD features
clf = LogisticRegression(max_iter = 1000)

clf.fit(X_train_svd, y_train)
# Class probabilities for the test set and, for the training-loss check, the training set
y_pred = clf.predict_proba(X_test_svd)
y_pred_train = clf.predict_proba(X_train_svd)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Initialize Random Forest Classifier (default hyper-parameters, fixed seed)
rf_clf = RandomForestClassifier(random_state=42)

# Fit the model on the SVD features
rf_clf.fit(X_train_svd, y_train)

# Predict class probabilities for the test set and for the training set
y_pred_rf = rf_clf.predict_proba(X_test_svd)
y_pred_train_rf = rf_clf.predict_proba(X_train_svd)


In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Initialize AdaBoost Classifier (default hyper-parameters, fixed seed)
ada_clf = AdaBoostClassifier(random_state=42)

# Fit the model on the SVD features
ada_clf.fit(X_train_svd, y_train)

# Predict class probabilities for the test set and for the training set
y_pred_ada = ada_clf.predict_proba(X_test_svd)
y_pred_ada_train = ada_clf.predict_proba(X_train_svd)

In [None]:
from sklearn.neural_network import MLPClassifier

# Initialize MLP Classifier (default architecture, fixed seed, up to 1000 iterations)
mlp_clf = MLPClassifier(random_state=42, max_iter=1000)

# Fit the model on the SVD features
mlp_clf.fit(X_train_svd, y_train)

# Predict class probabilities for the test set and for the training set
y_pred_mlp = mlp_clf.predict_proba(X_test_svd)
y_pred_mlp_train = mlp_clf.predict_proba(X_train_svd)

In [None]:
# MLP class probabilities for the test set (one row per test domain)
y_pred_mlp

array([[3.50010655e-03, 6.33700662e-03, 1.09493001e-02, ...,
        4.98716547e-03, 5.25768143e-03, 9.49435725e-03],
       [5.93604503e-02, 6.67337296e-02, 7.06680897e-02, ...,
        1.26844774e-01, 6.78923330e-02, 1.70488117e-01],
       [4.92968660e-03, 8.19480975e-03, 5.90731680e-01, ...,
        1.35298671e-02, 2.21897593e-03, 8.47452440e-02],
       ...,
       [2.42182622e-02, 3.35224360e-02, 5.59523660e-02, ...,
        2.20434519e-01, 1.11464012e-01, 2.10695555e-02],
       [5.24480424e-04, 2.05669389e-02, 5.62165093e-04, ...,
        4.16252861e-04, 1.91348559e-03, 1.49351531e-03],
       [9.68179220e-04, 1.26986744e-02, 1.62412213e-02, ...,
        1.95843437e-03, 1.37963373e-03, 3.07606212e-03]])

In [None]:
# AdaBoost class probabilities for the test set
y_pred_ada

In [None]:
# Random Forest class probabilities for the test set
y_pred_rf

In [None]:
# Out-of-fold class probabilities for the logistic-regression baseline (5-fold CV).
# y_preds_proba was previously assigned only inside commented-out code, so this
# cell raised NameError on a fresh kernel. cross_val_predict fits clones, leaving
# the already-fitted clf untouched.
y_preds_proba = cross_val_predict(clf, X_train_svd, y_train, cv=5, method='predict_proba')
y_preds_proba

In [None]:
# Logistic-regression class probabilities for the test set
y_pred

In [None]:
# Logistic-regression class probabilities for the test set (repeats the preceding display cell)
y_pred

In [None]:
# Logistic-regression class probabilities for the training set
y_pred_train

In [None]:
from sklearn.metrics import log_loss

# Cross-entropy of the logistic-regression probabilities on the training set.
# y_pred_train was produced from the same data the model was fitted on, so this
# is a training loss, not a held-out estimate.
cross_entropy_loss = log_loss(y_train, y_pred_train, labels=clf.classes_)
cross_entropy_loss

1.129101523277945

In [None]:
# Training-set cross-entropy for AdaBoost (probabilities come from the fitting data)
cross_entropy_loss = log_loss(y_train, y_pred_ada_train, labels=ada_clf.classes_)
cross_entropy_loss

2.0878526919321616

In [None]:
# Training-set cross-entropy for the Random Forest (probabilities come from the fitting data)
cross_entropy_loss = log_loss(y_train, y_pred_train_rf, labels=rf_clf.classes_)
cross_entropy_loss

0.6114254606650923

In [None]:
# Training-set cross-entropy for the MLP (probabilities come from the fitting data)
cross_entropy_loss = log_loss(y_train, y_pred_mlp_train, labels=mlp_clf.classes_)
cross_entropy_loss

0.3409724364978738

In [None]:
# Write predictions to a file: a header row, then one row per test domain with
# the MLP's class probabilities.
# Every test domain must have exactly one row of probabilities.
assert len(test_domains) == y_pred_mlp.shape[0]

# newline='' hands line-ending control to the csv module; without it, every
# record is followed by an empty line on Windows.
with open('sample_submission.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    # One "class_<i>" column per probability column instead of a hard-coded count
    n_classes = y_pred_mlp.shape[1]
    writer.writerow(['domain_name'] + ['class_' + str(i) for i in range(n_classes)])
    for test_host, probabilities in zip(test_domains, y_pred_mlp):
        writer.writerow([test_host] + probabilities.tolist())

In [None]:
### ATTENTION: plug in y_preds_proba or y_pred above, as appropriate!