In [1]:
#Understanding the data set
import pandas as pd
import numpy as np

def inspect_x_test(file_path):
    try:
        df = pd.read_csv(file_path)
        print("Read with commas")
    except pd.errors.ParserError:
        try:
            df = pd.read_csv(file_path, delimiter='\t')
            print("Read with tabs")
        except Exception as e:
            print(f"Error reading the .txt file: {e}")
            return
    except Exception as e:
        print(f"Error reading the file: {e}")
        return

    print("Data Preview:")
    print(f"Number of rows (samples): {df.shape[0]}")
    print(f"Number of columns (features): {df.shape[1]}")
    print("\nColumn names:")
    print(df.columns.tolist())

# inspect_x_test("Data/x_test.txt")
# inspect_x_test("Data/y_test.txt")


def preview_large_csv(file_path, num_lines=20, delimiter=','):
    try:
        df = pd.read_csv(file_path, delimiter=delimiter, nrows=num_lines)
        print(df)
    except Exception as e:
        print(f"Error loading preview: {e}")

# preview_large_csv("Data/x_test.txt", delimiter='\t')

def check_nulls_in_data(x_file_path, y_file_path):
    def read_file_smart(path, label):
        try:
            df = pd.read_csv(path)
            print(f"{label}: Read with commas")
        except pd.errors.ParserError:
            try:
                df = pd.read_csv(path, delimiter='\t')
                print(f"{label}: Read with tabs")
            except Exception as e:
                print(f"Error reading the {label} file: {e}")
                return None
        except Exception as e:
            print(f"Error reading the {label} file: {e}")
            return None
        return df

    X = read_file_smart(x_file_path, "X")
    y = read_file_smart(y_file_path, "y")

    if X is None or y is None:
        return

    if X.isnull().values.any() or y.isnull().values.any():
        print("Null found")
    else:
        print("No nulls found")

check_nulls_in_data('Data/x_test.txt', 'Data/y_test.txt')
check_nulls_in_data('Data/x_train.txt', 'Data/y_train.txt')



X: Read with tabs
y: Read with commas
Null found
X: Read with tabs
y: Read with commas
Null found


In [None]:
#Function to preprocess the data
def preprocessXandY(X, y):
    lang_codes = ['ita', 'fra', 'eng', 'ind', 'spa'] #lang codes of what we will use Italian, French, English, Indonesian, Spanish
    return None



In [33]:
#Understanding the data set
import pandas as pd
import numpy as np

def file_to_np_array(path, label):
    try:
        df = pd.read_csv(path, sep='<NonExistenceSeparator>', header=None, engine='python')
        print(f"{label}: Read each line into row")
    except Exception as e:
        print(f"Error reading the {label} file: {e}")
        return None
    return df.to_numpy()


def clean_np_data(X, y):
    stacked = np.hstack((y, X)) # Stack y and X side by side
    print(stacked.shape)
    clean_stacked = stacked[~np.any(pd.isna(stacked), axis=1), :] # Remove empty values
    print(clean_stacked.shape)
    lang_codes = ['ita', 'fra', 'eng', 'ind', 'spa', 'lat', 'ron', 'por', 'pol', 'swe', 'vie', 'war', 'rup','nld', 
                  'deu', 'ces', 'aze', 'cat', 'ceb', 'fin', 'hau', 'ibo', 'jbo', 'kin', 'kur', 'lug', 'nob', 'orm', 'ton'
                  ,'tsn', 'xho']
    true_clean = clean_stacked[np.isin(clean_stacked[:,0], lang_codes),:] # Remove all rows that aren't our target languages
    print(true_clean.shape)
    return true_clean[:,1], true_clean[:,0] # Return cleaned as X and y split again

def clean_filter_and_stack(X1_file, y1_file, X2_file, y2_file):
    X1_clean, y1_clean = clean_np_data(file_to_np_array(X1_file, X1_file), 
                                       file_to_np_array(y1_file, y1_file))
    X2_clean, y2_clean = clean_np_data(file_to_np_array(X2_file, X2_file), 
                                       file_to_np_array(y2_file, y2_file))
    # return np.hstack((X1_clean, X2_clean)), np.hstack((y1_clean, y2_clean))
    return X1_clean, y1_clean, X2_clean, y2_clean

X_train, y_train, X_test, y_test = clean_filter_and_stack("Data/x_train.txt", 
                                      "Data/y_train.txt", 
                                      "Data/x_test.txt", 
                                      "Data/y_test.txt")
    

Data/x_train.txt: Read each line into row
Data/y_train.txt: Read each line into row
(117500, 2)
(117000, 2)
(15500, 2)
Data/x_test.txt: Read each line into row
Data/y_test.txt: Read each line into row
(117500, 2)
(117000, 2)
(15500, 2)


In [34]:
from sklearn.model_selection import train_test_split
# X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_all, test_size=0.3, random_state=17)
# print(X_tr.shape, y_tr.shape)
# print(X_te.shape, y_te.shape)

In [35]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(strip_accents='unicode')
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)
print("Done vectorizing")

Done vectorizing


In [36]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_vectors, y_train)
print("Done training MNB")

Done training MNB


In [None]:
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_test_vectors)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)
print(np.unique(y_pred))
print(X_test[0:10])

Accuracy:  0.9712258064516129
['nld' 'ind' 'eng' 'cat' 'tsn' 'fin' 'kur' 'orm' 'nob' 'spa' 'rup' 'fra'
 'nob' 'eng' 'cat' 'ind' 'ind' 'tsn' 'rup' 'orm' 'fra' 'aze' 'lug' 'rup'
 'cat' 'ita' 'nob' 'hau']
['Schiedam is gelegen tussen Rotterdam en Vlaardingen, oorspronkelijk aan de Schie en later ook aan de Nieuwe Maas. Per 30 april 2017 had de gemeente 77.833 inwoners (bron: CBS). De stad is vooral bekend om haar jenever, de historische binnenstad met grachten, en de hoogste windmolens ter wereld.'
 'Argentina adalah sebuah negara yang kaya dengan SDA, tingkat melek huruf yang tinggi, sektor pertanian yang maju serta industri yang beragam. Malangnya, sejak akhir 1980-an negara ini telah menimbun hutang luar negeri yang tinggi, inflasi sampai 200% sebulan, dan pengeluaran yang merudum. Dalam mengatasi krisis ekonomi tersebut, pemerintahan telah mengambil langkah-langkah seperti liberalisasi perdagangan, deregulasi, dan swastanisasi. Pada 1991, pemerintahan telah melaksanakan reformasi fina