In [None]:
# Import necessary libraries (Shell 1)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, mean_squared_error
import pickle

# Load the dataset (Shell 2)
df = pd.read_excel('Copper_Set.xlsx')

# Data Understanding and Exploration (Shell 3)
# Display the first few rows of the dataset
st.write("Data preview:")
st.dataframe(df.head())

# Identify the types of variables.
# The dtype objects are rendered as plain strings so Streamlit does not have
# to serialise numpy dtype instances into an Arrow table.
st.write("Data types:")
st.write(df.dtypes.astype(str))

# Check for missing values
st.write("Missing values:")
st.write(df.isna().sum())

# Check for skewness and outliers using Seaborn charts.
# Every chart is drawn on its own Figure. Drawing through the implicit pyplot
# state and passing the `plt` module to st.pyplot keeps reusing one figure,
# which would paint the histogram on top of the boxplot.
outlier_fig, outlier_ax = plt.subplots()
sns.boxplot(data=df, ax=outlier_ax)
outlier_ax.set_title('Boxplot to identify outliers')
st.pyplot(outlier_fig)
plt.close(outlier_fig)  # release the figure once it has been rendered

price_fig, price_ax = plt.subplots()
sns.histplot(df['selling_price'], kde=True, ax=price_ax)
price_ax.set_title('Distribution of Selling Price')
st.pyplot(price_fig)
plt.close(price_fig)

# Data Preprocessing (Shell 4)
# Fill missing 'material_ref' entries with the column's most frequent value.
# fit_transform returns a 2-D (n_samples, 1) array, so it is flattened
# explicitly before being written back into a single column.
imputer = SimpleImputer(strategy='most_frequent')
df['material_ref'] = np.ravel(imputer.fit_transform(df[['material_ref']]))

# Treat reference columns as categorical variables
df['material_ref'] = df['material_ref'].astype('category')

# Handle skewness using log transformation for 'selling_price'
# (predict() reverses this with np.expm1).
# NOTE(review): log1p produces NaN for values below -1 and -inf at -1 —
# confirm 'selling_price' contains no such rows before training on it.
df['selling_price'] = np.log1p(df['selling_price'])

# Encoding categorical variables: 'status' becomes integer codes; the fitted
# encoder is kept so predictions can be mapped back to the original labels.
label_encoder = LabelEncoder()
df['status'] = label_encoder.fit_transform(df['status'])

# Split the dataset into train and test sets (Shell 5)
# Targets: log-transformed price for regression, encoded status for
# classification. Everything else is used as features.
y_regression = df['selling_price']
y_classification = df['status']
X = df.drop(columns=['selling_price', 'status'])

# A single call partitions the features and both targets with the same
# shuffled row split (80% train / 20% test, fixed seed).
(
    X_train, X_test,
    y_train_reg, y_test_reg,
    y_train_class, y_test_class,
) = train_test_split(X, y_regression, y_classification, test_size=0.2, random_state=42)
X_train_class, X_test_class = X_train, X_test

# Feature Scaling: statistics are learned from the training rows only and
# then applied to both splits.
# NOTE(review): X still holds every remaining column of df, including the
# categorical 'material_ref' — confirm all of them are numeric for the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Regression Model (Shell 6)
# Fit a decision tree on the scaled training features, then score it on the
# held-out rows.
regressor = DecisionTreeRegressor(random_state=42).fit(X_train_scaled, y_train_reg)
y_pred_reg = regressor.predict(X_test_scaled)

# The target was log-transformed during preprocessing, so this error is
# measured on the log scale rather than in price units.
mse = mean_squared_error(y_true=y_test_reg, y_pred=y_pred_reg)
st.write(f'Mean Squared Error (Regression Model): {mse}')

# Classification Model (Shell 7)
# The classifier is trained on the unscaled features (tree ensembles do not
# depend on feature scale); predict() feeds it unscaled input as well.
classifier = ExtraTreesClassifier(random_state=42)
classifier.fit(X_train_class, y_train_class)
y_pred_class = classifier.predict(X_test_class)

# precision/recall/F1 default to average='binary', which raises ValueError as
# soon as more than two labels are involved. The encoder is fitted on every
# distinct status value, so fall back to a weighted average in that case and
# keep the binary behaviour when only two labels occur.
observed_labels = np.union1d(y_test_class, y_pred_class)
averaging = 'binary' if len(observed_labels) <= 2 else 'weighted'

accuracy = accuracy_score(y_test_class, y_pred_class)
precision = precision_score(y_test_class, y_pred_class, average=averaging)
recall = recall_score(y_test_class, y_pred_class, average=averaging)
f1 = f1_score(y_test_class, y_pred_class, average=averaging)
st.write(f'Accuracy: {accuracy}')
st.write(f'Precision: {precision}')
st.write(f'Recall: {recall}')
st.write(f'F1 Score: {f1}')

# Save models and encoders using Pickle (Shell 8)
# Each fitted object is written to its own file in the working directory;
# predict() later reloads the two model files from there.
artifacts = {
    'regression_model.pkl': regressor,
    'classification_model.pkl': classifier,
    'scaler.pkl': scaler,
    'label_encoder.pkl': label_encoder,
}
for file_name, fitted_object in artifacts.items():
    with open(file_name, 'wb') as f:
        pickle.dump(fitted_object, f)

# Streamlit User Interface (Shell 9)
def predict(input_data, task):
    """Return a model prediction for the given feature rows.

    Args:
        input_data: 2-D array-like (rows x features) whose values are
            numeric or strings convertible to float.
        task: Either 'Regression' or 'Classification'.

    Returns:
        For 'Regression', the predicted selling price mapped back to the
        original scale with expm1. For 'Classification', the original
        status labels recovered through the label encoder.

    Raises:
        ValueError: If a value cannot be converted to float or if ``task``
            is not one of the supported names.
    """
    # Values typed into Streamlit text inputs arrive as strings; the
    # estimators need numbers, so convert once up front.
    features = np.asarray(input_data, dtype=float)

    # NOTE: pickle files must only be loaded from trusted locations; these
    # are the files written by this script itself.
    if task == 'Regression':
        with open('regression_model.pkl', 'rb') as model_file:
            model = pickle.load(model_file)
        # The regressor was trained on scaled features and a log1p target.
        prediction = model.predict(scaler.transform(features))
        return np.expm1(prediction)  # Reverse transformation of log to get original scale
    if task == 'Classification':
        with open('classification_model.pkl', 'rb') as model_file:
            model = pickle.load(model_file)
        # The classifier was trained on the unscaled features.
        prediction = model.predict(features)
        return label_encoder.inverse_transform(prediction)
    raise ValueError(f'Unsupported task: {task!r}')

# Interactive Streamlit App
st.title("Industrial Copper Modeling Prediction")

# User input fields for model prediction: one free-text box per feature column
task = st.selectbox('Choose a task', ['Regression', 'Classification'])
raw_values = []
for column in X.columns:
    raw_values.append(st.text_input(f'Enter value for {column}:'))

if st.button('Predict'):
    # text_input returns strings (empty when untouched). Convert them here so
    # blank or non-numeric entries give a readable message instead of a
    # traceback from inside the model.
    try:
        input_data = np.array([float(value) for value in raw_values]).reshape(1, -1)
    except ValueError:
        st.error('Please enter a numeric value for every field.')
    else:
        result = predict(input_data, task)
        st.write(f'Prediction: {result}')

# Visualizations (Shell 10)
# 'status' holds the integer codes produced by label_encoder. value_counts()
# orders them by frequency and may return any number of classes, so each
# counted code is translated back to its own label instead of assuming a
# fixed two-item list.
status_counts = df['status'].value_counts()
labels = [
    str(name)
    for name in label_encoder.inverse_transform(status_counts.index.to_numpy())
]
values = status_counts.tolist()

# Each chart gets its own Figure so nothing drawn earlier through the implicit
# pyplot state leaks into it.
status_fig, status_ax = plt.subplots()
status_ax.pie(values, labels=labels, autopct='%1.1f%%', startangle=140)
status_ax.set_title('Classification of Status')
st.pyplot(status_fig)
plt.close(status_fig)

# 'selling_price' is the log-transformed value at this point in the script.
scatter_fig, scatter_ax = plt.subplots()
sns.scatterplot(data=df, x='thickness', y='selling_price', ax=scatter_ax)
scatter_ax.set_title('Thickness vs Selling Price')
st.pyplot(scatter_fig)
plt.close(scatter_fig)

# Readme for GitHub (Shell 11)
# Markdown text that is written verbatim to README.md below.
readme_content = """
# Industrial Copper Modeling Project

This project addresses the challenges in the copper industry by building a regression model to predict selling prices and a classification model to classify lead status as WON or LOST. 

### Steps
1. **Data Understanding**: Performed data exploration to understand variable types and their distributions.
2. **Data Preprocessing**: Handled missing values, skewness, and encoded categorical variables.
3. **EDA**: Used Seaborn to visualize the data.
4. **Model Building**: Built Regression and Classification models using tree-based algorithms.
5. **Web App with Streamlit**: Created an interactive page for regression and classification predictions.

#### Usage
- Install necessary libraries using `requirements.txt`
- Run the notebook to train the models and save them.
- Use `streamlit run app.py` to deploy the web application.

### Project Evaluation Metrics
- Regression model: Mean Squared Error
- Classification model: Accuracy, Precision, Recall, F1 Score

"""

# Save README.md
# The encoding is pinned so the file does not depend on the platform's
# default text encoding.
with open('README.md', 'w', encoding='utf-8') as readme_file:
    readme_file.write(readme_content)


2024-10-29 17:06:26.784 
  command:

    streamlit run C:\Users\Sourav Kamble\Anaconda Prompt\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2024-10-29 17:06:26.815 Serialization of dataframe to Arrow table was unsuccessful due to: ("Could not convert dtype('O') with type numpy.dtype[object_]: did not recognize Python value type when inferring an Arrow data type", 'Conversion failed for column 0 with type object'). Applying automatic fixes for column types to make the dataframe Arrow-compatible.
