In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer

# Input folders, one per class. Each is passed to load_data() further down:
# the healthy folder is loaded with label 0 and the broken-tooth folder with
# label 1. The paths are machine-specific absolute Windows paths; change them
# to point at the local copies of the data before running.
healthy_data_folder = r'C:\Users\found\all anaconda\Untitled Folder 1\healthy_data_folder'
broken_tooth_data_folder = r'C:\Users\found\all anaconda\Untitled Folder 1\broken_tooth_data_folder'

def load_data(folder, label, delimiter='\t', header=None):
    """Load every data file in *folder* into one labelled DataFrame.

    Each regular file directly inside *folder* is parsed with
    ``pandas.read_csv``, gets a constant ``label`` column, and the per-file
    frames are stacked row-wise.

    Args:
        folder: Directory containing the delimited text files.
        label: Value written to the ``label`` column of every row.
        delimiter: Field separator passed to ``read_csv`` (tab by default).
        header: Row number to use as column names, passed to ``read_csv``.
            The default ``None`` treats every row as data, so columns are
            named by position (0, 1, 2, ...) and line up across files.
            Pass ``header=0`` for files that do start with a header row.

    Returns:
        A DataFrame with the rows of all files (index reset) plus ``label``.

    Raises:
        ValueError: If *folder* contains no regular files.
    """
    frames = []
    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order (and therefore any later seeded split) reproducible.
    for file in sorted(os.listdir(folder)):
        file_path = os.path.join(folder, file)
        # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints), which
        # read_csv cannot open.
        if not os.path.isfile(file_path):
            continue
        # Without header=None the first data row of each file would be
        # consumed as column names, giving every file different columns and
        # producing a mostly-NaN frame after concatenation.
        df = pd.read_csv(file_path, delimiter=delimiter, header=header)
        df['label'] = label
        frames.append(df)
    if not frames:
        raise ValueError(f"No data files found in folder: {folder!r}")
    return pd.concat(frames, ignore_index=True)

# Load both classes; the label column encodes the class (0 = healthy folder,
# 1 = broken-tooth folder).
healthy_data = load_data(healthy_data_folder, label=0)
broken_tooth_data = load_data(broken_tooth_data_folder, label=1)

# Stack the two classes into a single frame with a fresh 0..n-1 index.
data = pd.concat([healthy_data, broken_tooth_data], ignore_index=True)

# Features are every column except 'label'; the target is 'label' itself.
X = data.drop(columns=['label'])
y = data['label']

# Print the initial shape
print(f"Shape of X before imputation: {X.shape}")
print("Missing values in X:")
print(X.isnull().sum())

# Force text columns to numeric; values that cannot be parsed become NaN and
# are handled by the imputer below.
for col in X.select_dtypes(include=['object']).columns:
    X[col] = pd.to_numeric(X[col], errors='coerce')

print("Missing values in X after conversion:")
print(X.isnull().sum())

# SimpleImputer (mean strategy) silently drops columns that have no observed
# value at all, keeping the remaining columns in their original order. Record
# which columns survive so the imputed array can be labelled correctly even
# when a dropped column sits in the middle of the frame.
kept_columns = X.columns[X.notna().any()]

imputer = SimpleImputer(strategy='mean')

X_imputed = imputer.fit_transform(X)

print(f"Shape of X_imputed: {X_imputed.shape}")

if X_imputed.shape[1] != X.shape[1]:
    print("Warning: The number of columns has changed after imputation.")
    print(f"Columns dropped because they contain no values: {list(X.columns.difference(kept_columns, sort=False))}")

# Label by the surviving columns (not by the first k names of X) and keep X's
# index so the rows stay aligned with y.
X_imputed = pd.DataFrame(X_imputed, columns=kept_columns, index=X.index)

print(f"Shape of X after imputation: {X_imputed.shape}")

# One-hot encode any non-numeric columns, dropping the first level of each.
# NOTE(review): X_imputed is built from the imputer's numeric output, so this
# is expected to leave the frame unchanged -- confirm before relying on it.
X_encoded = pd.get_dummies(X_imputed, drop_first=True)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Fit a 100-tree random forest on the training portion (seeded for repeatability).
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Score the held-out rows.
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Predictions vs Actual
# y_test keeps its original row index, so the printed index identifies the
# source rows; y_pred is a plain array placed alongside it by position.
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print("Predictions vs Actual:")
print(results.head(10))

# Sample new data for prediction
# Example values, adjust based on your features: supply exactly one value per
# column of X_encoded, in the same column order.
new_data = np.array([[0.5, 1.5, 2.0, 3.5, 4.0]])

if new_data.shape[1] != X_encoded.shape[1]:
    # Forcing X_encoded's column names onto a row of the wrong width raises an
    # opaque shape error; report the mismatch instead so the run still finishes.
    print(
        f"Skipping sample prediction: new_data has {new_data.shape[1]} value(s) "
        f"but the model was trained on {X_encoded.shape[1]} feature(s)."
    )
else:
    # Reuse the training column names so the model sees matching features.
    new_data_df = pd.DataFrame(new_data, columns=X_encoded.columns)  # Make sure it matches X_encoded

    predicted_label = rf_model.predict(new_data_df)

    # Labels follow the loading step: 0 = healthy, anything else = broken tooth.
    if predicted_label[0] == 0:
        print("The prediction for the input data is: Healthy")
    else:
        print("The prediction for the input data is: Broken Tooth")


Shape of X before imputation: (2021099, 81)
Missing values in X:
4.63671e+000     1932268
5.16978e-001     1932268
-3.20594e+000    1932268
1.82241e+000     1932268
Unnamed: 4       2021099
                  ...   
-2.16663e+000    1910765
-5.51068e+000    1915372
4.80685e+000     1915372
-1.42651e+000    1915372
5.08883e-001     1915372
Length: 81, dtype: int64
Missing values in X after conversion:
4.63671e+000     1932268
5.16978e-001     1932268
-3.20594e+000    1932268
1.82241e+000     1932268
Unnamed: 4       2021099
                  ...   
-2.16663e+000    1910765
-5.51068e+000    1915372
4.80685e+000     1915372
-1.42651e+000    1915372
5.08883e-001     1915372
Length: 81, dtype: int64
Shape of X_imputed: (2021099, 80)
Shape of X after imputation: (2021099, 80)
