In [9]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Define paths to healthy and broken-tooth data folders
healthy_data_folder = r'C:\Users\found\all anaconda\Untitled Folder 1\healthy_data_folder'
broken_tooth_data_folder = r'C:\Users\found\all anaconda\Untitled Folder 1\broken_tooth_data_folder'

# Function to load data from .txt files
def load_data(folder, label, delimiter='\t'):  # Adjust delimiter if necessary
    data = []
    for file in os.listdir(folder):
        file_path = os.path.join(folder, file)
        
        # Read the .txt file (adjust delimiter if needed)
        df = pd.read_csv(file_path, delimiter=delimiter)  # Use correct delimiter
        
        # Add a label column to indicate the class (0 = healthy, 1 = broken tooth)
        df['label'] = label
        data.append(df)
    
    return pd.concat(data, ignore_index=True)

# Load healthy and broken-tooth data
healthy_data = load_data(healthy_data_folder, label=0, delimiter='\t')  # Adjust delimiter if needed
broken_tooth_data = load_data(broken_tooth_data_folder, label=1, delimiter='\t')  # Adjust delimiter

# Combine the two datasets into one
data = pd.concat([healthy_data, broken_tooth_data], ignore_index=True)

# Define features and target
X = data.drop(columns=['label'])  # Drop the 'label' column to use the remaining columns as features
y = data['label']  # Use the 'label' column as the target

# Check for missing values
print("Missing values in X:")
print(X.isnull().sum())
print("Missing values in y:")
print(y.isnull().sum())

# Check shapes
print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

# Handle missing values (if any)
X = X.fillna(X.mean())  # Fill missing values with the mean
y = y.fillna(y.mode()[0])  # Fill missing values in the target with the mode

# Ensure categorical features are properly encoded (if any)
X = pd.get_dummies(X, drop_first=True)

# Check the data types
print("Data types in X:")
print(X.dtypes)
print("Data types in y:")
print(y.dtypes)

# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check shapes of train sets
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}")

# Fit the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

try:
    rf_model.fit(X_train, y_train)
except ValueError as e:
    print("Error during model fitting:", e)
    print("X_train sample:", X_train.head())
    print("y_train sample:", y_train.head())
    raise  # Re-raise the error to show it in the output

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Display predictions alongside actual values for comparison
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print("Predictions vs Actual:")
print(results.head(10))  # Display the first 10 rows of actual vs predicted

# Example input data for prediction (adjust feature values accordingly)
feature_names = X.columns.tolist()  # Get feature names from the DataFrame
new_data = np.array([[0.5, 1.5, 2.0, 3.5, 4.0]])  # Adjust this array as needed

# Create a DataFrame for the new data
new_data_df = pd.DataFrame(new_data, columns=feature_names)

# Make the prediction
predicted_label = rf_model.predict(new_data_df)

# Output the prediction result
if predicted_label[0] == 0:
    print("The prediction for the input data is: Healthy")
else:
    print("The prediction for the input data is: Broken Tooth")

# Optional: Feature importance
importances = rf_model.feature_importances_
feature_importance = pd.DataFrame({'feature': X.columns, 'importance': importances}).sort_values(by='importance', ascending=False)
print("Feature Importances:")
print(feature_importance)


Missing values in X:
4.63671e+000     1932268
5.16978e-001     1932268
-3.20594e+000    1932268
1.82241e+000     1932268
Unnamed: 4       2021099
                  ...   
-2.16663e+000    1910765
-5.51068e+000    1915372
4.80685e+000     1915372
-1.42651e+000    1915372
5.08883e-001     1915372
Length: 81, dtype: int64
Missing values in y:
0
Shape of X: (2021099, 81)
Shape of y: (2021099,)
Data types in X:
4.63671e+000     float64
5.16978e-001     float64
-3.20594e+000    float64
1.82241e+000     float64
Unnamed: 4       float64
                  ...   
-2.16663e+000    float64
-5.51068e+000    float64
4.80685e+000     float64
-1.42651e+000    float64
5.08883e-001     float64
Length: 81, dtype: object
Data types in y:
int64
Shape of X_train: (1616879, 81)
Shape of y_train: (1616879,)
Error during model fitting: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGra

ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values