# In [5]:
from dash import Dash, html, dcc, Input, Output, State
import pandas as pd
import plotly.express as px
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score
import base64
import io

# Dash application instance; the layout and callbacks below are attached to it.
app = Dash(__name__)

# Module-level state shared between callbacks.
# `df` is assigned by handle_file() when a CSV is uploaded and read by the
# chart and training callbacks; `model_pipeline` is assigned by train_model()
# and read by make_prediction().
df = None
model_pipeline = None

# Page layout. Component ids are referenced by the callbacks below, so they
# must stay in sync with the Input/Output/State declarations.
# Style dictionaries use camelCased CSS property names throughout, which is
# the form Dash documents for the `style` property.
app.layout = html.Div([
    html.H1("Milestone 4 Regression Application", style={'textAlign': 'center', 'marginBottom': '20px'}),

    # --- File upload -------------------------------------------------------
    html.Div([
        html.H2("Upload File"),
        dcc.Upload(
            id='upload-data',
            children=html.Div(['Drag and Drop or ', html.A('Select a CSV File')]),
            style={
                'width': '100%',
                'height': '60px',
                'lineHeight': '60px',
                'borderWidth': '1px',
                'borderStyle': 'dashed',
                'borderRadius': '5px',
                'textAlign': 'center',
                'marginBottom': '20px'
            },
            multiple=False
        ),
    ], style={'padding': '10px', 'border': '1px solid lightgrey'}),

    # --- Target selection (options filled in after an upload) ---------------
    html.Div([
        html.Label("Select Target:", style={'fontWeight': 'bold'}),
        dcc.Dropdown(id='target-dropdown', placeholder="Select Target Variable"),
    ], style={'margin': '20px 0'}),

    # --- Two side-by-side charts -------------------------------------------
    html.Div([
        html.Div([
            html.H4("Bar Chart - Average of Target by Category:"),
            dcc.RadioItems(id='category-radio', inline=True),
            dcc.Graph(id='bar-chart-1', style={'height': '400px'})
        ], style={'width': '48%', 'display': 'inline-block'}),

        html.Div([
            html.H4("Correlation Strength of Numerical Variables:"),
            dcc.Graph(id='bar-chart-2', style={'height': '400px'})
        ], style={'width': '48%', 'display': 'inline-block'})
    ]),

    # --- Feature selection --------------------------------------------------
    html.Div([
        html.H4("Select Features for Training:"),
        dcc.Checklist(id='feature-checklist', inline=True)
    ], style={'margin': '20px 0'}),

    # --- Training -----------------------------------------------------------
    html.Div([
        html.H2("Train Model"),
        html.Button('Train', id='train-button', n_clicks=0),
        html.Div(id='training-output', style={'fontWeight': 'bold', 'margin': '10px 0'}),
    ], style={'padding': '10px', 'border': '1px solid lightgrey', 'marginTop': '20px'}),

    # --- Prediction ---------------------------------------------------------
    html.Div([
        html.H2("Make Predictions"),
        dcc.Input(id='prediction-input', type='text', placeholder="Enter values (comma-separated)",
                  style={'width': '80%', 'marginRight': '10px'}),
        html.Button('Predict', id='predict-button', n_clicks=0),
        html.Div(id='prediction-output', style={'fontWeight': 'bold', 'margin': '10px 0'}),
    ], style={'padding': '10px', 'border': '1px solid lightgrey', 'marginTop': '20px'})
])


@app.callback(
    [Output('target-dropdown', 'options'),
     Output('feature-checklist', 'options'),
     Output('category-radio', 'options')],
    Input('upload-data', 'contents'),
    State('upload-data', 'filename')
)
def handle_file(contents, filename):
    """Parse an uploaded CSV and populate the target, feature and category controls.

    The parsed frame is stored in the module-level ``df`` so the other
    callbacks can read it.

    Args:
        contents: Upload payload of the form ``"<header>,<base64 data>"``,
            or ``None`` when nothing has been uploaded yet.
        filename: Name of the uploaded file; used only in the log message
            emitted when parsing fails.

    Returns:
        Three option lists (target dropdown, feature checklist, category
        radio items). Numeric columns feed the first two, ``object`` columns
        the third. Empty lists are returned when there is no upload. When the
        upload cannot be parsed, the outputs are left untouched and the
        previously loaded dataset is kept.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    from dash import no_update

    global df
    if contents is None:
        return [], [], []

    try:
        # Split only once: everything after the first comma is the payload.
        _, content_string = contents.split(',', 1)
        decoded = base64.b64decode(content_string)
        parsed = pd.read_csv(io.StringIO(decoded.decode('utf-8')))
    except (ValueError, pd.errors.ParserError, pd.errors.EmptyDataError) as exc:
        # Covers a missing comma, invalid base64, non-UTF-8 bytes and
        # malformed/empty CSV. Keep the previous dataset and options so the
        # UI stays consistent instead of failing the callback.
        app.logger.warning("Could not parse uploaded file %r: %s", filename, exc)
        return no_update, no_update, no_update

    # Publish the new dataset only after it parsed successfully.
    df = parsed
    numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
    categorical_cols = df.select_dtypes(include='object').columns.tolist()

    numeric_options = [{'label': col, 'value': col} for col in numerical_cols]
    category_options = [{'label': col, 'value': col} for col in categorical_cols]
    # Separate list objects for the two numeric outputs, as in the original.
    return numeric_options, list(numeric_options), category_options


@app.callback(
    [Output('bar-chart-1', 'figure'),
     Output('bar-chart-2', 'figure')],
    [Input('target-dropdown', 'value'),
     Input('category-radio', 'value')]
)
def update_charts(target, category):
    """Build the category-average chart and the correlation chart.

    Args:
        target: Name of the selected target column, or ``None``.
        category: Name of the selected categorical column, or ``None``.

    Returns:
        A pair ``(category_figure, correlation_figure)``. Either entry is an
        empty dict when its chart cannot be drawn: no dataset, no target, a
        target that is not a numeric column of the current dataset, or (for
        the first chart only) no valid category.
    """
    if df is None or target is None:
        return {}, {}

    numeric = df.select_dtypes(include=np.number)
    # The selected value can be stale after a different file is uploaded;
    # both charts need the target to be a numeric column of the current data.
    if target not in numeric.columns:
        return {}, {}

    # The category chart is the only one that depends on `category`, so a
    # missing category must not blank out the correlation chart.
    category_fig = {}
    if category is not None and category in df.columns:
        avg_data = df.groupby(category)[target].mean().reset_index()
        category_fig = px.bar(avg_data, x=category, y=target, title=f"Average {target} by {category}")

    # Drop the target's correlation with itself (always 1.0), which carries
    # no information and compresses the other bars.
    corr_data = numeric.corr()[target].drop(labels=target).abs().reset_index()
    correlation_fig = px.bar(
        corr_data, x='index', y=target,
        title=f"Correlation Strength of Numerical Variables with {target}"
    )
    return category_fig, correlation_fig


@app.callback(
    Output('training-output', 'children'),
    Input('train-button', 'n_clicks'),
    [State('target-dropdown', 'value'), State('feature-checklist', 'value')]
)
def train_model(n_clicks, target, features):
    """Fit a linear-regression pipeline on the uploaded data and report R².

    Reads the module-level ``df`` and, on success, stores the fitted pipeline
    in the module-level ``model_pipeline``. A failed run leaves any previously
    fitted pipeline in place.

    Args:
        n_clicks: Click count of the Train button.
        target: Name of the target column, or ``None``.
        features: List of selected feature column names, possibly empty/None.

    Returns:
        A status string: a prompt when inputs are missing, an error message
        when training cannot proceed, or the R² score on the 20% hold-out.
    """
    global model_pipeline
    if not n_clicks or df is None or target is None or not features:
        return "Please upload data, select a target variable, and select features to train."

    # Training on the target itself leaks the answer into the model and
    # yields a meaningless perfect score.
    if target in features:
        return "Error during training: the target variable cannot also be selected as a feature."

    # Selections can be stale after a different file has been uploaded.
    missing = [col for col in [target, *features] if col not in df.columns]
    if missing:
        return ("Error during training: column(s) not found in the uploaded data: "
                + ", ".join(map(str, missing)))

    # Rows without a target value cannot be used for fitting or scoring.
    labelled = df.dropna(subset=[target])
    X = labelled[features]
    y = labelled[target]

    num_cols = X.select_dtypes(include=["number"]).columns
    cat_cols = X.select_dtypes(include=["object"]).columns

    numeric_steps = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])
    categorical_steps = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ])
    preprocessor = ColumnTransformer([
        ('num', numeric_steps, num_cols),
        ('cat', categorical_steps, cat_cols),
    ])

    # Build into a local first: the shared pipeline is replaced only after a
    # successful fit, so a failed run does not clobber a working model.
    candidate = Pipeline([
        ('preprocessor', preprocessor),
        ('model', LinearRegression())
    ])

    try:
        # The split is inside the try because it raises on very small datasets.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        candidate.fit(X_train, y_train)
        r2 = r2_score(y_test, candidate.predict(X_test))
    except Exception as e:
        # UI boundary: surface any training failure as a message.
        return f"Error during training: {str(e)}"

    model_pipeline = candidate
    return f"Model trained! The R² score is: {r2:.2f}"


@app.callback(
    Output('prediction-output', 'children'),
    Input('predict-button', 'n_clicks'),
    [State('prediction-input', 'value'), State('feature-checklist', 'value')]
)
def make_prediction(n_clicks, input_values, features):
    """Predict the target for one row of comma-separated numeric values.

    Reads the module-level ``model_pipeline``. Values are matched to the
    columns the pipeline was fitted on when it exposes them; otherwise the
    current checklist selection is used.

    Args:
        n_clicks: Click count of the Predict button.
        input_values: Text entered by the user, e.g. ``"1.5, 2, 3"``.
        features: Currently checked feature names (fallback column list).

    Returns:
        A status string: a prompt when inputs are missing, an error message
        when parsing or prediction fails, or the formatted prediction.
    """
    if not n_clicks or not input_values or model_pipeline is None:
        return "Please train the model and input valid values."

    # Prefer the column names recorded at fit time: the checklist may have
    # changed since training, and the model expects its own columns.
    # getattr's default also covers an unfitted pipeline, where the
    # attribute lookup raises AttributeError.
    fitted_names = getattr(model_pipeline, 'feature_names_in_', None)
    expected = list(fitted_names) if fitted_names is not None else list(features or [])
    if not expected:
        return "Please train the model and input valid values."

    try:
        input_list = [float(x) for x in input_values.split(',')]
    except ValueError as e:
        return f"Error in prediction: {str(e)}"

    if len(input_list) != len(expected):
        return (f"Error: Expected {len(expected)} values, but got {len(input_list)}. "
                f"Expected order: {', '.join(map(str, expected))}")

    try:
        input_df = pd.DataFrame([input_list], columns=expected)
        prediction = model_pipeline.predict(input_df)[0]
    except Exception as e:
        # UI boundary: surface any prediction failure as a message.
        return f"Error in prediction: {str(e)}"
    return f"Predicted value: {prediction:.2f}"


if __name__ == '__main__':
    # Newer Dash releases replace `run_server` with `run`; prefer `run` and
    # fall back to `run_server` only when the installed Dash predates it.
    start_server = getattr(app, 'run', None) or app.run_server
    start_server(debug=True)
