In [None]:
import faiss
import pickle
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression

# Load the raw dataset; every column except the label and the user id is a feature.
file_path = 'data/data.csv'
data = pd.read_csv(file_path)

user_id_column = 'user'
features = data.drop(columns=['target', user_id_column])
target = data['target']

# Maps five Georgian answer strings to the integers 0-4.
# It is not referenced in this cell; it is kept in case other cells rely on it.
section1_label_mapping = {
    'მძულს': 0,
    'არ მომწონს': 1,
    'ნეიტრალური ვარ': 2,
    'მომწონს': 3,
    'მიყვარს': 4
}

# The target encoder is fitted on a fixed list of class codes rather than on the
# data, so the code -> integer assignment does not depend on which classes
# happen to be present in this particular file.
target_encoder = LabelEncoder()
target_encoder.fit(['a', 'b', 'g', 'd', 'e', 'v', 'z', 't', 'i', 'k'])
target_encoded = target_encoder.transform(target)

# Label-encode every feature column on the string form of its values and keep
# the fitted encoder per column.
# NOTE(review): values are compared as strings, so a numeric column is ranked
# lexicographically ('10' sorts before '2') - confirm this is intended.
label_encoders = {}
for column in features.columns:
    le = LabelEncoder()
    unique_values = list(map(str, features[column].unique().tolist()))
    le.fit(unique_values)
    features[column] = le.transform(list(map(str, features[column])))
    label_encoders[column] = le

encoded_data = pd.concat([data[[user_id_column]], features, pd.Series(target_encoded, name='target')], axis=1)

# Fixed seed so the train/validation split is reproducible.
train_data, val_data = train_test_split(encoded_data, test_size=0.2, random_state=42)

X_train = train_data.drop(columns=[user_id_column, 'target']).values.astype(np.float32)
y_train = train_data['target']

X_train = np.ascontiguousarray(X_train)

# Rows are scaled to unit length in place, so the inner-product index below
# ranks neighbours by cosine similarity.
faiss.normalize_L2(X_train)

d = X_train.shape[1]
faiss_index = faiss.IndexFlatIP(d)
faiss_index.add(X_train)

# Persist the index, the labels of the indexed rows and the target encoder.
# The following cell loads exactly these three files; writing them here keeps
# the on-disk artifacts in sync with the split built above instead of relying
# on files left over from an earlier run.
faiss_index_filename = 'models/faiss_index.index'
faiss.write_index(faiss_index, faiss_index_filename)
print(f"FAISS index saved to {faiss_index_filename}")

# target_labels[i] is the encoded class of the i-th vector added to the index.
target_labels = y_train.values
target_labels_filename = 'models/target_labels.pkl'
with open(target_labels_filename, 'wb') as file:
    pickle.dump(target_labels, file)
print(f"Target labels saved to {target_labels_filename}")

target_encoder_filename = 'models/target_encoder.pkl'
with open(target_encoder_filename, 'wb') as file:
    pickle.dump(target_encoder, file)
print(f"Target encoder saved to {target_encoder_filename}")

In [None]:
# Build the validation feature matrix (cast to float32) and the encoded labels.
X_val = (
    val_data
    .drop(columns=[user_id_column, 'target'])
    .values
    .astype(np.float32)
)
y_val = val_data['target'].values

# Restore the FAISS index from disk.
faiss_index = faiss.read_index('models/faiss_index.index')

# Restore the target labels and the target encoder.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('models/target_labels.pkl', 'rb') as labels_file:
    target_labels = pickle.load(labels_file)

with open('models/target_encoder.pkl', 'rb') as encoder_file:
    target_encoder = pickle.load(encoder_file)
# FAISS პროგნოზები
def get_faiss_predictions(faiss_index, X, k=1):
    """Predict labels by looking up the nearest indexed vectors in FAISS.

    Args:
        faiss_index: FAISS index to search.
        X: 2-D array-like of query vectors, one row per sample.
        k: Number of neighbours to retrieve per query row.

    Returns:
        Array of labels taken from the module-level ``target_labels`` at the
        neighbour positions returned by the search. The neighbour matrix is
        flattened, so the result has ``len(X) * k`` entries (one per row for
        the default ``k=1``).
    """
    # Always work on a private float32, C-contiguous copy. np.ascontiguousarray
    # hands back the caller's own array when it is already contiguous, and
    # faiss.normalize_L2 rescales rows in place - which would silently
    # overwrite the caller's matrix.
    queries = np.array(X, dtype=np.float32, order='C')
    faiss.normalize_L2(queries)
    _, neighbor_ids = faiss_index.search(queries, k)
    return target_labels[neighbor_ids.flatten()]

def print_classification_metrics(title, y_true, y_pred, class_names):
    """Print accuracy, weighted precision/recall/F1 and a per-class report.

    Args:
        title: Heading line printed before the metrics.
        y_true: Ground-truth encoded labels.
        y_pred: Predicted encoded labels.
        class_names: Display names, where position ``i`` names encoded label ``i``.
    """
    # Explicit label ids keep the report aligned with class_names even when a
    # class is missing from both y_true and y_pred; without them
    # classification_report rejects the mismatched target_names.
    label_ids = np.arange(len(class_names))
    print(title)
    print("სიზუსტე:", accuracy_score(y_true, y_pred))
    print("პრეციზია:", precision_score(y_true, y_pred, average='weighted'))
    print("გამოხმობა:", recall_score(y_true, y_pred, average='weighted'))
    print("F1 ქულა:", f1_score(y_true, y_pred, average='weighted'))
    print("\nკლასიფიკაციის ანგარიში:")
    print(classification_report(y_true, y_pred, labels=label_ids, target_names=class_names))


# Give FAISS its own copy of the validation matrix: get_faiss_predictions
# normalizes its input with faiss.normalize_L2, which works in place, and the
# random forest below must see the un-normalized features.
faiss_pred = get_faiss_predictions(faiss_index, X_val.copy())

# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open('models/random_forest_model.pkl', 'rb') as file:
    rf_model = pickle.load(file)

rf_pred = rf_model.predict(X_val)

# Metrics for the FAISS nearest-neighbour predictions.
print_classification_metrics("FAISS მეტრიკები:", y_val, faiss_pred, target_encoder.classes_)

# Metrics for the random forest predictions.
print_classification_metrics("\nRandom Forest მეტრიკები:", y_val, rf_pred, target_encoder.classes_)

In [None]:
# Reload the dataset and separate the label from the feature columns.
file_path = 'data/data.csv'
data = pd.read_csv(file_path)

y = data['target']
X = data.drop(columns=['target', 'user'])

# Reproducible 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Hyper-parameter grid explored by the cross-validated search.
param_grid = dict(
    penalty=['l2'],               # regularization penalty
    C=[0.01, 0.1, 1, 10, 100],    # inverse of regularization strength
    solver=['lbfgs'],             # optimizer
    max_iter=[200, 300, 400],     # iteration budget
)

model = LogisticRegression(multi_class='multinomial')

# 5-fold cross-validation, selecting the configuration with the best accuracy.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Score the refitted best estimator on the hold-out set.
y_pred = best_model.predict(X_test)

report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Persist the winning model.
with open('models/logistic_regression_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

best_params, accuracy, report

In [None]:
# Accuracy plus weighted-average precision, recall and F1 of y_pred against y_test.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

accuracy, precision, recall, f1

EDA

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency, ttest_ind
import matplotlib.pyplot as plt
import seaborn as sns

data = pd.read_csv('data/updated_data.csv')

# Single-letter target codes -> Georgian profession names used for display.
mapping = {
    'a': 'პროგრამისტი',
    'b': 'ექიმი',
    'g': 'ფინანსური ანალიტიკოსი',
    'd': 'მასწავლებელი',
    'e': 'არქიტექტორი',
    'v': 'იურისტი',
    'z': 'ფსიქოლოგი',
    't': 'ჟურნალისტი',
    'i': 'ინჟინერი',
    'k': 'დიზაინერი'
}

data['target'] = data['target'].map(mapping)

# Convert every object-typed column (including the remapped target) to the
# pandas 'category' dtype in a single astype call.
object_columns = data.select_dtypes(include='object').columns
data = data.astype({name: 'category' for name in object_columns})


def chi_square_tests(data, target_col):
    """Run a chi-square test of independence of each categorical column vs. the target.

    Args:
        data: DataFrame to analyse; only columns with 'category' dtype are tested.
        target_col: Name of the target column. It is cross-tabulated against
            every other categorical column and is itself excluded from the result.

    Returns:
        Dict mapping each tested column name to
        ``{'chi2_stat': <statistic>, 'p_val': <p-value>}``.
    """
    candidate_columns = [
        name
        for name in data.select_dtypes(include='category').columns
        if name != target_col
    ]

    results = {}
    for name in candidate_columns:
        observed = pd.crosstab(data[target_col], data[name])
        # Only the statistic and the p-value are reported; the degrees of
        # freedom and the expected frequencies are discarded.
        statistic, p_value, *_ = chi2_contingency(observed)
        results[name] = {'chi2_stat': statistic, 'p_val': p_value}
    return results

def _is_question_column(col, minimum=50):
    """Return True when ``col`` is named 'q<N>' with integer N >= ``minimum``.

    Names that start with 'q' but have no integer suffix are skipped instead of
    raising ValueError.
    """
    if not col.startswith('q'):
        return False
    try:
        return int(col[1:]) >= minimum
    except ValueError:
        return False


# NOTE(review): data[1:] leaves the first row out of the chi-square tests while
# the summaries below use every row - confirm the omission is intentional.
chi_square_results = chi_square_tests(data[1:], 'target')

# Summaries exclude the user identifier column.
categorical_summary = data.drop('user', axis=1).describe(include='category').T
numerical_summary = data.drop('user', axis=1).describe(include='int64').T

chi_square_results_df = pd.DataFrame.from_dict(chi_square_results, orient='index')

print("Chi-Square Test Results:")
print(chi_square_results_df)
print("\nCategorical Summary:")
print(categorical_summary)
print("\nNumerical Summary:")
print(numerical_summary)


# Frequency of each target class, most common first.
plt.figure(figsize=(10, 6))
sns.countplot(x='target', data=data, order=data['target'].value_counts().index, palette='viridis')
plt.title('სამიზნე ცვლადის სიხშირული განაწილება')
plt.xlabel('სამიზნე')
plt.ylabel('რაოდენობა')
plt.xticks(rotation=45)
plt.show()

# Correlations among the question columns q50 and above.
selected_columns = [col for col in data.columns if _is_question_column(col)]
# Keep only numeric/bool columns: DataFrame.corr() raises on non-numeric data
# in pandas >= 2.0, whereas older releases dropped such columns silently.
corr_matrix = data[selected_columns].select_dtypes(include=['number', 'bool']).corr()

plt.figure(figsize=(24, 16))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('კორელაციის მატრიცა რიცხვითი ცვლადებისთვის')
plt.show()
corr_matrix.to_csv('data/corr.csv')