<a href="https://colab.research.google.com/github/Poojitha12345678/Alpha-Creative-Contests/blob/main/Alpha_Creative_Contests_NPI_Survey_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import joblib

In [25]:
file_path = '/content/dummy_npi_data.xlsx'
dataset = pd.read_excel(file_path)

In [26]:
dataset.head()

Unnamed: 0,NPI,State,Login Time,Logout Time,Usage Time (mins),Region,Speciality,Count of Survey Attempts
0,1000000000,NY,2025-03-08 06:09:00,2025-03-08 06:28:00,19,Northeast,Cardiology,3
1,1000000001,MI,2025-03-08 12:28:00,2025-03-08 13:10:00,42,Midwest,Oncology,5
2,1000000002,CA,2025-03-08 15:11:00,2025-03-08 15:37:00,26,West,Oncology,8
3,1000000003,TX,2025-03-08 14:17:00,2025-03-08 15:36:00,79,Northeast,Orthopedics,9
4,1000000004,GA,2025-03-08 15:59:00,2025-03-08 17:37:00,98,West,Oncology,0


In [27]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   NPI                       1000 non-null   int64         
 1   State                     1000 non-null   object        
 2   Login Time                1000 non-null   datetime64[ns]
 3   Logout Time               1000 non-null   datetime64[ns]
 4   Usage Time (mins)         1000 non-null   int64         
 5   Region                    1000 non-null   object        
 6   Speciality                1000 non-null   object        
 7   Count of Survey Attempts  1000 non-null   int64         
dtypes: datetime64[ns](2), int64(3), object(3)
memory usage: 62.6+ KB


In [28]:
#No missing values

In [29]:
#extract relevant time feature
dataset['Login Hours'] = dataset['Login Time'].dt.hour
dataset['Logout Time'] = dataset['Logout Time'].dt.hour

In [30]:
dataset.head()

Unnamed: 0,NPI,State,Login Time,Logout Time,Usage Time (mins),Region,Speciality,Count of Survey Attempts,Login Hours
0,1000000000,NY,2025-03-08 06:09:00,6,19,Northeast,Cardiology,3,6
1,1000000001,MI,2025-03-08 12:28:00,13,42,Midwest,Oncology,5,12
2,1000000002,CA,2025-03-08 15:11:00,15,26,West,Oncology,8,15
3,1000000003,TX,2025-03-08 14:17:00,15,79,Northeast,Orthopedics,9,14
4,1000000004,GA,2025-03-08 15:59:00,17,98,West,Oncology,0,15


In [31]:
#Define Target variable
dataset['Survey_Participation'] = (dataset['Count of Survey Attempts'] > dataset['Count of Survey Attempts'].median()).astype(int)

In [32]:
dataset.head()

Unnamed: 0,NPI,State,Login Time,Logout Time,Usage Time (mins),Region,Speciality,Count of Survey Attempts,Login Hours,Survey_Participation
0,1000000000,NY,2025-03-08 06:09:00,6,19,Northeast,Cardiology,3,6,0
1,1000000001,MI,2025-03-08 12:28:00,13,42,Midwest,Oncology,5,12,0
2,1000000002,CA,2025-03-08 15:11:00,15,26,West,Oncology,8,15,1
3,1000000003,TX,2025-03-08 14:17:00,15,79,Northeast,Orthopedics,9,14,1
4,1000000004,GA,2025-03-08 15:59:00,17,98,West,Oncology,0,15,0


In [33]:
#select features
features = ['Login Hours', 'Logout Time','Usage Time (mins)', 'Count of Survey Attempts']
X = dataset[features]
y = dataset['Survey_Participation']

In [34]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
#Normalize data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [36]:
#Train Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_scaled, y_train)

In [37]:
#Evalute
accuracy = rf_classifier.score(X_test_scaled, y_test)
print(f'Accuracy: {accuracy}')

Accuracy: 1.0


In [38]:
#save the model
joblib.dump(rf_classifier, 'NPI_survey_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

print("Model Training Completed")

Model Training Completed


In [39]:
#Flask

In [40]:
from flask import Flask , request , jsonify , render_template

In [41]:
model = joblib.load('NPI_survey_model.pkl')
scaler = joblib.load('scaler.pkl')
df = pd.read_excel('/content/dummy_npi_data.xlsx')

app = Flask(__name__)

@app.route("/")
def home():
    return render_template("index.html")

@app.route("/predict", methods=["POST"])
def predict():
    login_hours = int(request.form["hour"])

    #prepare data for prediction
    df['Login Time'] = df['Login Time'].dt.hour
    df['Logout Time'] = df['Logout Time'].dt.hour
    input_data = df.copy()
    input_data['Login Time'] = login_hours

    #select features
    features = ['Login Time', 'Logout Time' ,'Usage Time (mins)', 'Count of Survey Attempts']
    X_input = input_data[features]
    X_scaled = scaler.transform(X_input)

    #predict
    input_data["Survey_Probability"] = model.predict_proba(X_scaled)[:,1]

    #return top doctors
    top_doctors = input_data.sort_values(by='Survey_Probability', ascending=False)[['NPI','Survey_Probability']].head(10)

    top_doctors.to_csv('top_doctors.csv', index=False)

    return render_template("index.html", predictions=top_doctors.to_dict(orient="records"))

if __name__ == '__main__':
  app.run()



 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
