In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Load the crude-oil dataset from a CSV file into a DataFrame
# (the source is CSV, read with pd.read_csv -- not an Excel workbook)
file_path = '/content/synthetic_crude_oil_data.csv'  # Replace with your file path
data = pd.read_csv(file_path)

In [None]:
# Preview the table: column headers plus the first five records
print(data.head(n=5))


   API_Gravity          GOR   Color Crude_Type
0    24.981605   777.699393    dark      heavy
1    48.028572  1312.851421    dark      heavy
2    39.279758  1809.418754    dark      heavy
3    33.946339  1598.337330  medium      heavy
4    16.240746  1709.841722  medium      heavy


In [None]:
# One-hot encode the categorical 'Color' column.
# - `sparse_output=False` makes fit_transform return a dense array; it replaces the
#   `sparse` keyword, which was deprecated in scikit-learn 1.2 and removed in 1.4.
# - `drop='first'` omits the first category's column to avoid multicollinearity.
# NOTE: this cell needs the original 'Color' column. A later cell rebinds `data`
# without it, so re-run the data-loading cell before re-running this one;
# otherwise `data[['Color']]` raises KeyError.
encoder = OneHotEncoder(sparse_output=False, drop='first')
color_encoded = encoder.fit_transform(data[['Color']])

KeyError: "None of [Index(['Color'], dtype='object')] are in the [columns]"

In [None]:
# Wrap the encoded array in a DataFrame whose columns carry the encoder's
# generated feature names. The row index of `data` is reused so that the
# column-wise concat in the next step pairs every encoded row with its source
# row even when `data` does not have the default 0..n-1 index.
color_encoded_df = pd.DataFrame(
    color_encoded,
    columns=encoder.get_feature_names_out(['Color']),
    index=data.index,
)


In [None]:
# Swap the raw 'Color' column for its one-hot encoded columns.
# NOTE: `data` is rebound here without 'Color', so the encoding cell above
# cannot be re-run until the CSV has been loaded again.
data_without_color = data.drop(columns=['Color'])
data = pd.concat([data_without_color, color_encoded_df], axis=1)

In [None]:
# Separate the label column from the predictors
target_column = 'Crude_Type'
y = data[target_column]                  # Target: crude oil type
X = data.drop(columns=[target_column])   # Features: every remaining column

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Create the Random Forest classifier (seeded so repeated runs build the same forest)
model = RandomForestClassifier(
    random_state=42,
)

In [None]:
# Fit the classifier on the training portion only
model.fit(X=X_train, y=y_train)

In [None]:
# Predict crude-oil types for the held-out test rows
y_pred = model.predict(X=X_test)


In [None]:
# Fraction of test rows whose predicted type equals the true type, shown as a percentage
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 100.00%


In [None]:
# Start from a fresh, unfitted classifier and score it with 5-fold cross-validation
# over the full feature matrix and target
model = RandomForestClassifier(random_state=42)
scores = cross_val_score(estimator=model, X=X, y=y, cv=5)

In [None]:
# Report the per-fold scores followed by their mean, one per line
print(
    f"Cross-validation scores: {scores}",
    f"Mean cross-validation score: {scores.mean()}",
    sep="\n",
)

Cross-validation scores: [1.    1.    0.975 0.995 0.985]
Mean cross-validation score: 0.991


In [None]:
# Refit the classifier using every available row (no hold-out)
model.fit(X=X, y=y)

In [None]:
# Describe the sample to classify using the raw, un-encoded columns of the CSV.
# 'Color' must be a category the encoder saw during fit. 'heavy' is a Crude_Type
# label, not a colour, so the model has no 'Color_heavy' feature.
# NOTE(review): assumes 'light' occurs in the CSV's 'Color' column -- confirm.
raw_input = pd.DataFrame({
    'API_Gravity': [30.8885970064779],
    'GOR': [847.507552075165],
    'Color': ['light'],
})

# Encode 'Color' with the already-fitted encoder so the dummy columns are
# produced exactly as they were for the training data.
input_color_df = pd.DataFrame(
    encoder.transform(raw_input[['Color']]),
    columns=encoder.get_feature_names_out(['Color']),
    index=raw_input.index,
)

# Align to the columns (names and order) of the feature matrix the model was
# fitted on; a colour column that is absent for this sample is filled with 0.
input_data = (
    pd.concat([raw_input.drop(columns=['Color']), input_color_df], axis=1)
    .reindex(columns=X.columns, fill_value=0)
)

In [None]:
# Classify the single input row; predict() returns one label per row,
# so the first element is the answer for this sample
predicted_type = model.predict(X=input_data)
print(f'Predicted type of crude oil: {predicted_type[0]}')

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Color_heavy
Feature names seen at fit time, yet now missing:
- Color_dark


In [None]:
# Per-class precision / recall / F1 for the held-out test predictions
print('Classification Report:')
print(classification_report(y_true=y_test, y_pred=y_pred))

Classification Report:
              precision    recall  f1-score   support

       heavy       1.00      1.00      1.00       228
       light       1.00      1.00      1.00        17
      medium       1.00      1.00      1.00        55

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300

