In [None]:
import pandas as pd
# Read the training table from disk and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run on a machine where this exact location exists.
data_path = "C:/Users/tomas/OneDrive/Υπολογιστής/Data.csv"
df = pd.read_csv(data_path)
print(df.head())

In [None]:
# Count the missing entries in every column of the training table.
missing_per_column = df.isnull().sum()
print(missing_per_column)

In [None]:
# Remove the 'feature_11' column from the training table.
df = df.drop(columns='feature_11')
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit an extra "None" line).
df.info()

In [None]:
# Summary statistics for the columns of the training table.
summary_stats = df.describe()
print(summary_stats)

In [None]:
# Flag rows that repeat an earlier row, then show how many there are.
duplicate_row_mask = df.duplicated()
duplicate_row_mask.sum()

In [None]:
import pandas as pd
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt
import seaborn as sns

# Fill missing feature values with a 4-nearest-neighbour imputer.
# The label column is excluded from the imputation: including it would let the
# label influence the neighbour distances used to fill in features, and any
# missing label would be replaced by an averaged, non-class value.
feature_cols = df.columns.drop('target')
imputer = KNNImputer(n_neighbors=4)
imputed_features = pd.DataFrame(
    imputer.fit_transform(df[feature_cols]),
    columns=feature_cols,
)
# Re-attach the untouched label as float64 (the dtype the all-column imputation
# used to produce) and restore the original column order.
imputed_features['target'] = df['target'].to_numpy(dtype='float64')
df_imputed = imputed_features[df.columns]
df = df_imputed
print(df.head())

In [None]:
# Draw one histogram (with KDE overlay) per feature on a 4-column grid.
original_columns = df.columns
features = df_imputed.columns.drop('target')
num_features = len(features)
plots_per_row = 4
# Ceiling division: the previous `n // 4 + 1` allocated an empty extra row
# whenever the feature count was an exact multiple of 4. Keep at least one row
# so the figure height is never zero.
rows = max(1, -(-num_features // plots_per_row))
plt.figure(figsize=(20, 5 * rows))
for i, col in enumerate(features):
    plt.subplot(rows, plots_per_row, i + 1)
    sns.histplot(df_imputed[col], bins=10, kde=True, color='red')
    plt.title(f"Histogram of {col}")
    plt.xlabel(col)
    plt.ylabel("Frequency")

plt.tight_layout()
plt.show()

In [None]:
# Flag outliers with the 1.5 * IQR rule, column by column.
# The rule is applied to the feature columns only. 'target' is the class label
# used later for stratified splitting and classification; running the IQR rule
# on it can mark every row of a minority class as an "outlier" (its IQR is 0
# when one class dominates), which would then be dropped from the data.
feature_values = df.drop(columns='target')
Q1 = feature_values.quantile(0.25)
Q3 = feature_values.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
# Boolean frame (same index as df): True where a value lies outside the fences.
outliers_iqr = (feature_values < lower_fence) | (feature_values > upper_fence)
outliers_per_column_iqr = outliers_iqr.sum()
print(outliers_per_column_iqr)

In [None]:
# Keep only the rows that have no flagged value in any column.
# .copy() makes the result an independent frame; later cells assign into `df`
# with .loc, which on a boolean-mask slice can raise SettingWithCopyWarning.
df_no_outliers = df[~outliers_iqr.any(axis=1)].copy()
print("\nDataFrame without outliers:")
df = df_no_outliers
print(df)

In [None]:
# DataFrame.info() prints its report directly and returns None; wrapping it in
# print() only adds a stray "None" line to the output.
df.info()

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scale every float64 column except the label.
# NOTE(review): the scaler is fitted on the full table before the train/test
# split, so test rows contribute to the fitted min/max — confirm this is intended.
float_columns = df.select_dtypes(include='float64').columns
numeric_cols = [name for name in float_columns if name != 'target']
scaler = MinMaxScaler()
scaled_block = scaler.fit_transform(df[numeric_cols])
df.loc[:, numeric_cols] = scaled_block
print(df.head())
print("Shape:", df.shape)

In [None]:
from sklearn.model_selection import train_test_split
# Separate the label from the predictors, then hold out 10% of rows for testing.
y = df['target']
X = df.drop(columns='target')
# stratify=y keeps the class proportions the same in both partitions.
split_parts = train_test_split(X, y, test_size=0.10, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts
print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Fit a small random forest (5 trees, depth <= 5) and report its accuracy on
# both partitions plus the test-set confusion matrix.
rf_settings = dict(n_estimators=5, max_depth=5, random_state=42, n_jobs=-1)
rf_model = RandomForestClassifier(**rf_settings)
rf_model.fit(X_train, y_train)

y_pred_train = rf_model.predict(X_train)
y_pred_test = rf_model.predict(X_test)

train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)
test_confusion = confusion_matrix(y_test, y_pred_test)
print("Accuracy on Training Set:", train_accuracy)
print("Accuracy on Test Set:", test_accuracy)
print("\nConfusion Matrix on Test Set:\n", test_confusion)

In [None]:
import pandas as pd
import numpy as np

# Label the forest's importances with the training column names and rank them
# from most to least important.
feature_importances = (
    pd.Series(rf_model.feature_importances_, index=X_train.columns)
    .sort_values(ascending=False)
)

print(feature_importances)

In [None]:
# Bar chart of the ranked feature importances, one bar per feature.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=feature_importances.index, y=feature_importances.values, ax=ax)

# Rotate the feature names so they do not overlap.
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlabel("Χαρακτηριστικά (Features)")
ax.set_ylabel("Σημασία (Feature Importance)")
ax.set_title("Scree Plot - Σημασία Χαρακτηριστικών από Random Forest")
fig.tight_layout()
plt.show()

In [None]:
# Names of the ten highest-ranked features, in ranking order.
top_10_features = feature_importances.index[:10]
# Restrict both partitions to those columns.
X_train_top10, X_test_top10 = (
    partition[top_10_features] for partition in (X_train, X_test)
)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Train a multilayer perceptron with one hidden layer of 90 units on the
# ten selected features and report accuracy on both partitions.
mlp = MLPClassifier(
    # `(90)` is just the integer 90; the trailing comma makes this the
    # one-element tuple that the parameter is documented to take.
    hidden_layer_sizes=(90,),
    activation='relu',
    solver='adam',
    max_iter=1000,
    random_state=42,
    learning_rate_init=0.001,
)

mlp.fit(X_train_top10, y_train)
y_pred_train = mlp.predict(X_train_top10)
y_pred_test = mlp.predict(X_test_top10)
print("Training Accuracy:", accuracy_score(y_train, y_pred_train))
print("Test Accuracy:", accuracy_score(y_test, y_pred_test))

In [None]:
import pandas as pd
# Read the evaluation table from disk and preview its first rows.
# NOTE(review): absolute, machine-specific path.
eval_path = "C:/Users/tomas/OneDrive/Υπολογιστής/Evaluation.csv"
dfeva = pd.read_csv(eval_path)
print(dfeva.head())

In [None]:
# Count the missing entries in every column of the evaluation table.
# NOTE(review): no imputation is applied to dfeva in this notebook; verify the
# counts are all zero before predicting.
eval_missing_per_column = dfeva.isnull().sum()
print(eval_missing_per_column)

In [None]:
# Remove 'feature_11' from the evaluation table, as was done for the training table.
dfeva = dfeva.drop(columns='feature_11')
# DataFrame.info() prints its report itself and returns None, so it is called
# directly rather than through print() (which would add a "None" line).
dfeva.info()

In [None]:
# Apply the scaler fitted on the training table to the evaluation table.
# This cell rescales dfeva in place, so it must be executed exactly once.
eval_scaled_block = scaler.transform(dfeva[numeric_cols])
dfeva.loc[:, numeric_cols] = eval_scaled_block
print(dfeva.head())
print("Shape:", dfeva.shape)

In [None]:
# Pick the first ten training columns out of the evaluation table.
# dfeva was already min-max scaled in the preceding cell; this cell used to
# call scaler.transform on it a second time, which double-scaled the
# evaluation features and fed the model values on the wrong scale.
selected_features = df.columns[:10].tolist()
X_eval = dfeva[selected_features]

In [None]:
# Preview the evaluation table after preprocessing.
eval_preview = dfeva.head()
print(eval_preview)

In [None]:
# Predict labels for the evaluation table with the trained MLP.
# The column list is taken from `top_10_features`, the same ranking that was
# used to build X_train_top10 for fitting the MLP. A hand-typed list can
# silently drift from that ranking (different names or order) whenever the
# data or the random forest changes.
top10_features = list(top_10_features)

X_eval = dfeva[top10_features]
y_pred_eval = mlp.predict(X_eval)

In [None]:
# Write the evaluation predictions to a one-column CSV without the row index.
predictions_df = pd.DataFrame(y_pred_eval, columns=['prediction'])
predictions_df.to_csv('mlp_predictions.csv', index=False)