In [16]:
## Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score,accuracy_score,confusion_matrix,classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor,KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector as selector
from sklearn.ensemble import VotingRegressor, VotingClassifier
from sklearn.multioutput import MultiOutputClassifier

In [2]:
## Importing Dataset
df = pd.read_csv('Myers Briggs Table_S1.csv')
df.head()

Unnamed: 0,S No,AGE,HEIGHT,WEIGHT,SEX,ACTIVITY LEVEL,PAIN 1,PAIN 2,PAIN 3,PAIN 4,MBTI,E,I,S,N,T,F,J,P,POSTURE
0,1,53,62,125,Female,Low,0.0,0.0,0.0,0.0,ESFJ,18,3,17,9,9,13,18,4,A
1,2,52,69,157,Male,High,7.0,8.0,5.0,3.0,ISTJ,6,15,14,12,21,3,13,9,B
2,3,30,69,200,Male,High,0.0,0.0,0.0,0.0,ESTJ,15,6,16,10,15,9,12,10,A
3,4,51,66,175,Male,Moderate,9.5,9.5,9.5,1.5,ISTJ,6,15,21,5,13,11,19,3,D
4,5,45,63,199,Female,Moderate,4.0,5.0,2.0,2.0,ENFJ,14,7,20,6,9,15,16,6,A


In [3]:
## copying data set and droping unwanted columns
df1 = df.copy()
df1.drop(['S No','AGE','HEIGHT','WEIGHT','ACTIVITY LEVEL'],axis=1,inplace=True)
df1.head()

Unnamed: 0,SEX,PAIN 1,PAIN 2,PAIN 3,PAIN 4,MBTI,E,I,S,N,T,F,J,P,POSTURE
0,Female,0.0,0.0,0.0,0.0,ESFJ,18,3,17,9,9,13,18,4,A
1,Male,7.0,8.0,5.0,3.0,ISTJ,6,15,14,12,21,3,13,9,B
2,Male,0.0,0.0,0.0,0.0,ESTJ,15,6,16,10,15,9,12,10,A
3,Male,9.5,9.5,9.5,1.5,ISTJ,6,15,21,5,13,11,19,3,D
4,Female,4.0,5.0,2.0,2.0,ENFJ,14,7,20,6,9,15,16,6,A


In [4]:
## Label Encoding
le = LabelEncoder()
df1['SEX'] = le.fit_transform(df1['SEX'])

df1.head()

Unnamed: 0,SEX,PAIN 1,PAIN 2,PAIN 3,PAIN 4,MBTI,E,I,S,N,T,F,J,P,POSTURE
0,0,0.0,0.0,0.0,0.0,ESFJ,18,3,17,9,9,13,18,4,A
1,1,7.0,8.0,5.0,3.0,ISTJ,6,15,14,12,21,3,13,9,B
2,1,0.0,0.0,0.0,0.0,ESTJ,15,6,16,10,15,9,12,10,A
3,1,9.5,9.5,9.5,1.5,ISTJ,6,15,21,5,13,11,19,3,D
4,0,4.0,5.0,2.0,2.0,ENFJ,14,7,20,6,9,15,16,6,A


In [5]:
## splitting data into X and y
Xr = df1[['SEX','E','I','N','S','T','F','J','P']]
yr = df1[['PAIN 1','PAIN 2','PAIN 3','PAIN 4']]


In [6]:
## splitting data into train and test
for i in range(0,40):
    Xr_train,Xr_test,yr_train,yr_test = train_test_split(Xr,yr,test_size=0.2,random_state= i)
    regressor = LinearRegression()
    regressor.fit(Xr_train, yr_train)
    print("Test Score for random state",i,"is",regressor.score(Xr_test,yr_test)*100)


Test Score for random state 0 is -15.719327137794686
Test Score for random state 1 is -39.985980020102545
Test Score for random state 2 is -1.5696366113999853
Test Score for random state 3 is -22.967479936186706
Test Score for random state 4 is -30.230201416516184
Test Score for random state 5 is -13.492503233256937
Test Score for random state 6 is -13.30144427662303
Test Score for random state 7 is -61.850317695166446
Test Score for random state 8 is -14.926478750385257
Test Score for random state 9 is -25.308301917430125
Test Score for random state 10 is -14.04881432507713
Test Score for random state 11 is -16.084868776844147
Test Score for random state 12 is -41.178095621903054
Test Score for random state 13 is -22.7489736784744
Test Score for random state 14 is -9.230696057393379
Test Score for random state 15 is -1.3357113343661937
Test Score for random state 16 is -53.189547434719685
Test Score for random state 17 is -25.05181428259763
Test Score for random state 18 is -10.786709

In [8]:
for clf in (LinearRegression(), RandomForestRegressor(),KNeighborsRegressor()):
    clf.fit(Xr_train, yr_train)
    y_pred = clf.predict(Xr_test)
    print("\n_______________________________________________________\n")
    print(clf.__class__.__name__,r2_score(yr_test, y_pred))
    print(f'the MAE score is:{mean_absolute_error(yr_test, y_pred)}')
    print(f'the MSE is:{mean_squared_error(yr_test, y_pred)}')
    print(f'the r2-score is:{r2_score(yr_test, y_pred)}')
    print("\n_______________________________________________________\n")


_______________________________________________________

LinearRegression -0.6671088460066603
the MAE score is:2.9029817165880347
the MSE is:13.16936209005551
the r2-score is:-0.6671088460066603

_______________________________________________________


_______________________________________________________

RandomForestRegressor -0.2265039689770479
the MAE score is:2.5750208333333333
the MSE is:9.542432048611111
the r2-score is:-0.2265039689770479

_______________________________________________________


_______________________________________________________

KNeighborsRegressor -0.2963162147915522
the MAE score is:2.6337499999999996
the MSE is:10.065375000000003
the r2-score is:-0.2963162147915522

_______________________________________________________



In [42]:
Xc = df1[['E','I','N','S','T','F','J','P']]
yc = df1[['MBTI','POSTURE']]

In [51]:
E=np.std(df1['E'])
I=np.std(df1['I'])
N=np.std(df1['N'])
S=np.std(df1['S'])
T=np.std(df1['T'])
F=np.std(df1['F'])
J=np.std(df1['J'])
P=np.std(df1['P'])


In [83]:
dataset_b=[]
for _,row in df1.iterrows():
        temp={
            'E':row['E'],
            'I':row['I'],
            'N':row['N'],
            'S':row['S'],
            'T':row['T'],
            'F':row['F'],
            'J':row['J'],
            'P':row['P'],
            'MBTI':row['MBTI'],
            'POSTURE':row['POSTURE']
            }
        dataset_b.append(temp)
for _ in range(1000):
    for _,row in df1.iterrows():
        temp={
                'E':row['E']+np.random.normal(E),
                'I':row['I']+np.random.normal(I),
                'N':row['N']+np.random.normal(N),
                'S':row['S']+np.random.normal(S),
                'T':row['T']+np.random.normal(T),
                'F':row['F']+np.random.normal(F),
                'J':row['J']+np.random.normal(J),
                'P':row['P']+np.random.normal(P),
                'MBTI':row['MBTI'],
                'POSTURE':row['POSTURE']
                }
        dataset_b.append(temp)
df2=pd.DataFrame(dataset_b)
df2.head()

Unnamed: 0,E,I,N,S,T,F,J,P,MBTI,POSTURE
0,18.0,3.0,9.0,17.0,9.0,13.0,18.0,4.0,ESFJ,A
1,6.0,15.0,12.0,14.0,21.0,3.0,13.0,9.0,ISTJ,B
2,15.0,6.0,10.0,16.0,15.0,9.0,12.0,10.0,ESTJ,A
3,6.0,15.0,5.0,21.0,13.0,11.0,19.0,3.0,ISTJ,D
4,14.0,7.0,6.0,20.0,9.0,15.0,16.0,6.0,ENFJ,A


In [87]:
## splitting data into X and y
Xc = df2[['E','I','N','S','T','F','J','P']]
yc = df2[['MBTI','POSTURE']]

In [85]:
x_orignal = df1[['E','I','N','S','T','F','J','P']]
y_orignal = df1[['MBTI','POSTURE']]

In [88]:
## splitting data into train and test
x_orignal_train,x_orignal_test,y_orignal_train,y_orignal_test = train_test_split(x_orignal,y_orignal,test_size=0.2,random_state= 0)

In [89]:
Xc_train,Xc_test,yc_train,yc_test = train_test_split(Xc,yc,test_size=0.2,random_state=0)

In [11]:
rf = RandomForestClassifier()


In [None]:
#voting classifier
voting_clf = VotingClassifier(estimators=[('lr',LogisticRegression() ), ('rf', RandomForestClassifier()), ('KNN', KNeighborsClassifier())], voting='hard')
voting_clf.fit(Xc_train, yc_train)
yc_pred = voting_clf.predict(Xc_test)
print("\n_______________________________________________________\n")
print(voting_clf.__class__.__name__, accuracy_score(Y_test, y_pred))
print(f'the accuracy score is:{accuracy_score(yc_test, yc_pred)}')
print(f'the f1-score is:{f1_score(yc_test, yc_pred)}')
print(f'the recall-score is:{recall_score(yc_test, yc_pred)}')
print(f'the confusion_matrix is:{confusion_matrix(yc_test, yc_pred)}')
tn, fp, fn, tp = confusion_matrix(yc_test, yc_pred).ravel()
print(f'Specificity is:',tn / (tn+fp))
print(f'Precision is:',tp / (tp+fp))
print("\n_______________________________________________________\n")

ValueError: y should be a 1d array, got an array of shape (77, 2) instead.

In [None]:
KNeighborsClassifier().get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [None]:
## grid search for best parameters
model_g=GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid={'n_neighbors':[1,2,3,4,5,6,7,8,9,10]},
    cv=5
    )

In [None]:
model_g.fit(Xc_train,yc_train)

Traceback (most recent call last):
  File "C:\Users\umair\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\umair\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\metrics\_scorer.py", line 430, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "C:\Users\umair\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\base.py", line 638, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
  File "C:\Users\umair\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\utils\_param_validation.py", line 192, in w

In [None]:
pd.DataFrame(model_g.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004598,0.0008,0.004997,0.000631,1,{'n_neighbors': 1},,,,,,,,1
1,0.005797,0.002785,0.021788,0.021661,2,{'n_neighbors': 2},,,,,,,,1
2,0.012392,0.008519,0.008196,0.00515,3,{'n_neighbors': 3},,,,,,,,1
3,0.004598,0.001743,0.026386,0.017852,4,{'n_neighbors': 4},,,,,,,,1
4,0.004797,0.000979,0.009395,0.008862,5,{'n_neighbors': 5},,,,,,,,1
5,0.030782,0.02459,0.012992,0.007344,6,{'n_neighbors': 6},,,,,,,,1
6,0.010194,0.002924,0.015791,0.017107,7,{'n_neighbors': 7},,,,,,,,1
7,0.011393,0.00634,0.012592,0.00467,8,{'n_neighbors': 8},,,,,,,,1
8,0.008595,0.007223,0.022986,0.028353,9,{'n_neighbors': 9},,,,,,,,1
9,0.003398,0.000489,0.028984,0.04507,10,{'n_neighbors': 10},,,,,,,,1


In [105]:
from xgboost import XGBClassifier
#import decision tree classifier
from sklearn.tree import DecisionTreeClassifier
#import adaboost classifier
from sklearn.ensemble import AdaBoostClassifier
model=MultiOutputClassifier((DecisionTreeClassifier()))

In [111]:
from sklearn.model_selection import GridSearchCV
param_grid = [
 {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
 {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
 ]
forest_reg = RandomForestClassifier()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
 scoring='neg_mean_squared_error',
 return_train_score=True)
grid_search.fit(Xc_train,yc_train)
# grid_search.best_params_
# grid_search.best_estimator_
# grid_search.best_score_
model.predict(Xc_test)
model.score(x_orignal_test,y_orignal_test)

Traceback (most recent call last):
  File "c:\Users\AbdulRaheemShahzad\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\Users\AbdulRaheemShahzad\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\metrics\_scorer.py", line 219, in __call__
    return self._score(
  File "c:\Users\AbdulRaheemShahzad\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\metrics\_scorer.py", line 267, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "c:\Users\AbdulRaheemShahzad\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\metrics\_regression.py", line 442, in mean_squared_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "c:\Users\AbdulRaheemShahzad\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\metrics\_regression.py", line 101, in _check_reg_target

0.85

In [113]:
grid_search.best_estimator_

In [114]:
grid_search.best_params_

{'max_features': 2, 'n_estimators': 3}

In [106]:
model.fit(Xc_train,yc_train)
model.predict(Xc_test)

array([['ESFP', 'B'],
       ['ENFP', 'A'],
       ['ISTJ', 'D'],
       ...,
       ['INTJ', 'C'],
       ['ENTP', 'B'],
       ['ESFJ', 'B']], dtype=object)

In [115]:
df2.shape

(97097, 10)

In [107]:
#accuracy score
model.score(x_orignal_test,y_orignal_test)

0.85

In [50]:
model.score(yc_test,model.predict(X))

Feature names unseen at fit time:
- MBTI
- POSTURE
Feature names seen at fit time, yet now missing:
- E
- F
- I
- J
- N
- ...



ValueError: could not convert string to float: 'ESTP'