<a href="https://colab.research.google.com/github/Srinidhi231/activity/blob/main/Untitled17.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pickle

# Step 1: Read the labelled training set from disk
train_df = pd.read_csv('train.csv')

# Step 2: Separate the seven fault indicators (targets) from everything else (features).
# The label names are listed once and reused for both the selection and the drop.
target_cols = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
y = train_df[target_cols]
# NOTE(review): every non-target column stays in X, which would include an
# identifier column such as `id` if train.csv has one -- confirm that is intended.
X = train_df.drop(columns=target_cols)

# Step 3: Hold out 20% of the rows for validation (fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 4: Fit one random forest per fault label through the multi-output wrapper
rf = RandomForestClassifier(n_estimators=100, random_state=42)
model = MultiOutputClassifier(rf).fit(X_train, y_train)

# Step 5: Print a precision/recall report for each label on the held-out rows.
# `y_pred` has one column per label, so iterating over its transpose walks the
# per-label prediction vectors in the same order as `y.columns`.
y_pred = model.predict(X_val)
for col, col_pred in zip(y.columns, y_pred.T):
    print(f"--- {col} ---")
    print(classification_report(y_val[col], col_pred))

# Step 6: Save the trained model to a .pkl file so the inference cell can reload it
with open('multi_output_rf_model.pkl', 'wb') as file:
    pickle.dump(model, file)

# Report success only after the `with` block has flushed and closed the file,
# so the message is never shown for a write that failed on close.
print("Model saved as 'multi_output_rf_model.pkl'")

--- Pastry ---
              precision    recall  f1-score   support

           0       0.92      1.00      0.96      3543
           1       0.52      0.04      0.07       301

    accuracy                           0.92      3844
   macro avg       0.72      0.52      0.52      3844
weighted avg       0.89      0.92      0.89      3844

--- Z_Scratch ---
              precision    recall  f1-score   support

           0       0.96      0.99      0.98      3624
           1       0.65      0.37      0.47       220

    accuracy                           0.95      3844
   macro avg       0.81      0.68      0.72      3844
weighted avg       0.94      0.95      0.95      3844

--- K_Scatch ---
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      3162
           1       0.91      0.90      0.90       682

    accuracy                           0.97      3844
   macro avg       0.94      0.94      0.94      3844
weighted avg       0.97 

In [2]:
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

# Step 1: Load the trained model (from the pickle file)
# SECURITY: pickle.load can execute arbitrary code embedded in the file. Only
# load a pickle you produced yourself (such as the one written by the training
# cell); never load one obtained from an untrusted source.
with open('multi_output_rf_model.pkl', 'rb') as file:
    model = pickle.load(file)

# Step 2: Load the test data
test_df = pd.read_csv('test.csv')

# Step 3: Check the columns in the test dataset
print("Test data columns:", test_df.columns)
print("Test data shape:", test_df.shape)

# Step 4: Load the sample submission to understand the expected format
sample_submission = pd.read_csv('sample_submission.csv')
print("Sample submission shape:", sample_submission.shape)

# Step 5: Define the target label columns
label_columns = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

# Step 6: Prepare the feature data (X_test) by dropping any label columns if present
X_test = test_df.drop(columns=[col for col in label_columns if col in test_df.columns])

# When the fitted model recorded its training column names, select exactly those
# columns in that order. This keeps inference working if test.csv has extra or
# reordered columns, and turns a missing feature into an explicit error instead
# of a failure deep inside predict().
expected_features = getattr(model, "feature_names_in_", None)
if expected_features is not None:
    missing_features = [col for col in expected_features if col not in X_test.columns]
    if missing_features:
        raise ValueError(
            f"test.csv is missing features the model was trained on: {missing_features}"
        )
    X_test = X_test[list(expected_features)]
print("Feature matrix shape:", X_test.shape)

# Step 7: Predict the positive-class (label == 1) probability for every fault type
def _positive_class_proba(proba, classes, index, positive_label=1):
    """Return P(label == positive_label) as a Series indexed by `index`.

    Parameters
    ----------
    proba : 2-D array
        One estimator's `predict_proba` output; its columns follow `classes`.
    classes : sequence
        The class labels that estimator was fitted on (its `classes_`).
    index : pandas.Index
        Row index to attach to the result.
    positive_label : scalar, default 1
        The class whose probability is wanted.

    The column is looked up through `classes` instead of assuming position 1,
    because an estimator fitted on a label with a single observed class exposes
    only one probability column.
    """
    classes = list(classes)
    if positive_label in classes:
        return pd.Series(proba[:, classes.index(positive_label)], index=index)
    # The estimator never saw a positive example, so it assigns it no probability.
    return pd.Series(0.0, index=index)


# Collect one (probability matrix, class labels) pair per output
if isinstance(model, MultiOutputClassifier):
    # MultiOutputClassifier keeps one fitted classifier per label in `estimators_`
    per_label = [(est.predict_proba(X_test), est.classes_) for est in model.estimators_]
else:
    # Assumes a natively multi-output classifier (e.g. a RandomForestClassifier
    # fitted on a 2-D y), whose predict_proba and classes_ are per-output lists
    per_label = list(zip(model.predict_proba(X_test), model.classes_))

# Fail loudly instead of silently pairing labels with the wrong estimators
if len(per_label) != len(label_columns):
    raise ValueError(
        f"Model produced {len(per_label)} outputs but {len(label_columns)} label columns were expected"
    )

probabilities = pd.DataFrame(
    {
        col: _positive_class_proba(proba, classes, X_test.index)
        for col, (proba, classes) in zip(label_columns, per_label)
    },
    index=X_test.index,
)

print("Probabilities shape:", probabilities.shape)

# Step 8: Put the predictions in the same row order as the sample submission.
# Matching row counts alone do not guarantee matching order, so whenever both
# files share an identifier column the rows are aligned on it.

# The first sample-submission column that is not a label is treated as the ID column
id_col = next((col for col in sample_submission.columns if col not in label_columns), None)
can_align = id_col is not None and id_col in test_df.columns

if len(probabilities) != len(sample_submission):
    print(f"WARNING: Number of predictions ({len(probabilities)}) doesn't match sample submission ({len(sample_submission)})")
    if not can_align:
        # Without a shared ID there is no way to decide which rows to keep
        raise ValueError(
            "Cannot align predictions with the sample submission: "
            "row counts differ and no shared identifier column was found"
        )

if can_align:
    test_ids = test_df[id_col].reset_index(drop=True)
    wanted_ids = sample_submission[id_col].reset_index(drop=True)

    # Skip the re-indexing when both files already list the same IDs in the same order
    if not test_ids.equals(wanted_ids):
        print(f"Using {id_col} to align predictions with sample submission format")

        # Re-key the predictions by test ID; on duplicate IDs the last row wins
        by_id = probabilities.copy()
        by_id.index = test_ids.to_numpy()
        by_id = by_id[~by_id.index.duplicated(keep="last")]

        # Pull the rows in sample-submission order (IDs absent from the test
        # data become NaN), then restore the sample submission's own row index
        aligned_probabilities = by_id.reindex(wanted_ids.to_numpy())
        aligned_probabilities.index = sample_submission.index

        unmatched = int(aligned_probabilities.isna().any(axis=1).sum())
        if unmatched:
            print(f"WARNING: {unmatched} sample submission rows have no matching {id_col} in the test data; their predictions are NaN")

        probabilities = aligned_probabilities

# Step 9: Start from a copy of the sample file so its column order and any
# non-label columns are kept, then overwrite each label column it contains
# with the predicted probabilities (assigned positionally, row by row)
submission = sample_submission.assign(
    **{
        col: probabilities[col].values
        for col in label_columns
        if col in sample_submission.columns
    }
)

# Step 10: Show the final layout as a quick sanity check
print("Final submission shape:", submission.shape)
print("Submission columns:", submission.columns)

# Step 11: Write the submission to CSV without the pandas row index
submission.to_csv('submission.csv', index=False)
print("Submission file 'submission.csv' created successfully!")

Test data columns: Index(['id', 'X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum',
       'Pixels_Areas', 'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity',
       'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer',
       'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness',
       'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index',
       'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'LogOfAreas',
       'Log_X_Index', 'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index',
       'SigmoidOfAreas'],
      dtype='object')
Test data shape: (12814, 28)
Sample submission shape: (12814, 8)
Feature matrix shape: (12814, 28)
Probabilities shape: (12814, 7)
Final submission shape: (12814, 8)
Submission columns: Index(['id', 'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps',
       'Other_Faults'],
      dtype='object')
Submission file 'submission.csv' created successfully!
