In [177]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy as sp
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif, VarianceThreshold
import plotly.express as px

In [178]:
career_dataset = pd.read_csv("../Datasets/career_pred.csv")

# 1.) Exploratory Data Analysis

In [179]:
career_dataset.head()

Unnamed: 0,Acedamic percentage in Operating Systems,percentage in Algorithms,Percentage in Programming Concepts,Percentage in Software Engineering,Percentage in Computer Networks,Percentage in Electronics Subjects,Percentage in Computer Architecture,Percentage in Mathematics,Percentage in Communication skills,Hours working per day,...,Interested Type of Books,Salary Range Expected,In a Realtionship?,Gentle or Tuff behaviour?,Management or Technical,Salary/work,hard/smart worker,worked in teams ever?,Introvert,Suggested Job Role
0,69,63,78,87,94,94,87,84,61,9,...,Prayer books,salary,no,stubborn,Management,salary,hard worker,yes,no,Database Developer
1,78,62,73,60,71,70,73,84,91,12,...,Childrens,salary,yes,gentle,Technical,salary,hard worker,no,yes,Portal Administrator
2,71,86,91,87,61,81,72,72,94,11,...,Travel,Work,no,stubborn,Management,work,hard worker,no,yes,Portal Administrator
3,76,87,60,84,89,73,62,88,69,7,...,Romance,Work,yes,gentle,Management,work,smart worker,yes,yes,Systems Security Administrator
4,92,62,90,67,71,89,73,71,73,4,...,Cookbooks,salary,no,stubborn,Management,work,hard worker,yes,yes,Business Systems Analyst


In [180]:
career_dataset.shape

(20000, 39)

In [181]:
career_dataset.columns

Index(['Acedamic percentage in Operating Systems', 'percentage in Algorithms',
       'Percentage in Programming Concepts',
       'Percentage in Software Engineering', 'Percentage in Computer Networks',
       'Percentage in Electronics Subjects',
       'Percentage in Computer Architecture', 'Percentage in Mathematics',
       'Percentage in Communication skills', 'Hours working per day',
       'Logical quotient rating', 'hackathons', 'coding skills rating',
       'public speaking points', 'can work long time before system?',
       'self-learning capability?', 'Extra-courses did', 'certifications',
       'workshops', 'talenttests taken?', 'olympiads',
       'reading and writing skills', 'memory capability score',
       'Interested subjects', 'interested career area ', 'Job/Higher Studies?',
       'Type of company want to settle in?',
       'Taken inputs from seniors or elders', 'interested in games',
       'Interested Type of Books', 'Salary Range Expected',
       'In a Rea

In [182]:
career_dataset.describe()

Unnamed: 0,Acedamic percentage in Operating Systems,percentage in Algorithms,Percentage in Programming Concepts,Percentage in Software Engineering,Percentage in Computer Networks,Percentage in Electronics Subjects,Percentage in Computer Architecture,Percentage in Mathematics,Percentage in Communication skills,Hours working per day,Logical quotient rating,hackathons,coding skills rating,public speaking points
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,77.0023,76.9482,77.01755,77.0945,76.9582,77.01555,77.06985,76.9131,76.92145,7.98595,4.99505,2.99265,5.00475,5.0128
std,10.085697,10.101733,10.134815,10.087837,10.020088,10.168888,10.069059,10.138555,10.103494,2.593798,2.578383,2.005791,2.576831,2.588875
min,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,4.0,1.0,0.0,1.0,1.0
25%,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0,6.0,3.0,1.0,3.0,3.0
50%,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,8.0,5.0,3.0,5.0,5.0
75%,86.0,86.0,86.0,86.0,85.0,86.0,86.0,86.0,86.0,10.0,7.0,5.0,7.0,7.0
max,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,12.0,9.0,6.0,9.0,9.0


In [183]:
career_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 39 columns):
 #   Column                                    Non-Null Count  Dtype 
---  ------                                    --------------  ----- 
 0   Acedamic percentage in Operating Systems  20000 non-null  int64 
 1   percentage in Algorithms                  20000 non-null  int64 
 2   Percentage in Programming Concepts        20000 non-null  int64 
 3   Percentage in Software Engineering        20000 non-null  int64 
 4   Percentage in Computer Networks           20000 non-null  int64 
 5   Percentage in Electronics Subjects        20000 non-null  int64 
 6   Percentage in Computer Architecture       20000 non-null  int64 
 7   Percentage in Mathematics                 20000 non-null  int64 
 8   Percentage in Communication skills        20000 non-null  int64 
 9   Hours working per day                     20000 non-null  int64 
 10  Logical quotient rating                   2000

In [184]:
career_dataset.isnull().sum()

Acedamic percentage in Operating Systems    0
percentage in Algorithms                    0
Percentage in Programming Concepts          0
Percentage in Software Engineering          0
Percentage in Computer Networks             0
Percentage in Electronics Subjects          0
Percentage in Computer Architecture         0
Percentage in Mathematics                   0
Percentage in Communication skills          0
Hours working per day                       0
Logical quotient rating                     0
hackathons                                  0
coding skills rating                        0
public speaking points                      0
can work long time before system?           0
self-learning capability?                   0
Extra-courses did                           0
certifications                              0
workshops                                   0
talenttests taken?                          0
olympiads                                   0
reading and writing skills        

In [185]:
def cats_per_feature(feature):
  print("{}: {}".format(feature,career_dataset[feature].nunique()))

In [186]:
for feature in career_dataset.columns:
  cats_per_feature(feature)

Acedamic percentage in Operating Systems: 35
percentage in Algorithms: 35
Percentage in Programming Concepts: 35
Percentage in Software Engineering: 35
Percentage in Computer Networks: 35
Percentage in Electronics Subjects: 35
Percentage in Computer Architecture: 35
Percentage in Mathematics: 35
Percentage in Communication skills: 35
Hours working per day: 9
Logical quotient rating: 9
hackathons: 7
coding skills rating: 9
public speaking points: 9
can work long time before system?: 2
self-learning capability?: 2
Extra-courses did: 2
certifications: 9
workshops: 8
talenttests taken?: 2
olympiads: 2
reading and writing skills: 3
memory capability score: 3
Interested subjects: 10
interested career area : 6
Job/Higher Studies?: 2
Type of company want to settle in?: 10
Taken inputs from seniors or elders: 2
interested in games: 2
Interested Type of Books: 31
Salary Range Expected: 2
In a Realtionship?: 2
Gentle or Tuff behaviour?: 2
Management or Technical: 2
Salary/work: 2
hard/smart worke

In [187]:
cont_features = list(career_dataset.columns[0:14])

In [188]:
cat_features = career_dataset.iloc[:,:-1].drop(cont_features,axis=1).columns

In [189]:
cont_features

['Acedamic percentage in Operating Systems',
 'percentage in Algorithms',
 'Percentage in Programming Concepts',
 'Percentage in Software Engineering',
 'Percentage in Computer Networks',
 'Percentage in Electronics Subjects',
 'Percentage in Computer Architecture',
 'Percentage in Mathematics',
 'Percentage in Communication skills',
 'Hours working per day',
 'Logical quotient rating',
 'hackathons',
 'coding skills rating',
 'public speaking points']

In [190]:
cat_features

Index(['can work long time before system?', 'self-learning capability?',
       'Extra-courses did', 'certifications', 'workshops',
       'talenttests taken?', 'olympiads', 'reading and writing skills',
       'memory capability score', 'Interested subjects',
       'interested career area ', 'Job/Higher Studies?',
       'Type of company want to settle in?',
       'Taken inputs from seniors or elders', 'interested in games',
       'Interested Type of Books', 'Salary Range Expected',
       'In a Realtionship?', 'Gentle or Tuff behaviour?',
       'Management or Technical', 'Salary/work', 'hard/smart worker',
       'worked in teams ever?', 'Introvert'],
      dtype='object')

In [191]:
len(cont_features) + len(cat_features)

38

In [192]:
cont_career_dataset = career_dataset[cont_features]
cat_career_dataset = career_dataset[cat_features]

In [193]:
cont_career_dataset.head()

Unnamed: 0,Acedamic percentage in Operating Systems,percentage in Algorithms,Percentage in Programming Concepts,Percentage in Software Engineering,Percentage in Computer Networks,Percentage in Electronics Subjects,Percentage in Computer Architecture,Percentage in Mathematics,Percentage in Communication skills,Hours working per day,Logical quotient rating,hackathons,coding skills rating,public speaking points
0,69,63,78,87,94,94,87,84,61,9,4,0,4,8
1,78,62,73,60,71,70,73,84,91,12,7,1,2,3
2,71,86,91,87,61,81,72,72,94,11,1,4,1,3
3,76,87,60,84,89,73,62,88,69,7,1,1,2,5
4,92,62,90,67,71,89,73,71,73,4,5,4,6,3


In [194]:
cat_career_dataset.head()

Unnamed: 0,can work long time before system?,self-learning capability?,Extra-courses did,certifications,workshops,talenttests taken?,olympiads,reading and writing skills,memory capability score,Interested subjects,...,interested in games,Interested Type of Books,Salary Range Expected,In a Realtionship?,Gentle or Tuff behaviour?,Management or Technical,Salary/work,hard/smart worker,worked in teams ever?,Introvert
0,yes,yes,yes,shell programming,cloud computing,no,yes,excellent,excellent,cloud computing,...,no,Prayer books,salary,no,stubborn,Management,salary,hard worker,yes,no
1,yes,no,yes,machine learning,database security,no,no,poor,medium,networks,...,yes,Childrens,salary,yes,gentle,Technical,salary,hard worker,no,yes
2,yes,no,yes,app development,web technologies,no,yes,poor,excellent,hacking,...,yes,Travel,Work,no,stubborn,Management,work,hard worker,no,yes
3,no,yes,no,python,data science,yes,no,medium,excellent,networks,...,no,Romance,Work,yes,gentle,Management,work,smart worker,yes,yes
4,no,no,no,app development,cloud computing,no,no,poor,excellent,Computer Architecture,...,yes,Cookbooks,salary,no,stubborn,Management,work,hard worker,yes,yes


# 2.) Data Cleaning

### 2.a) Scaling Continuous features

In [195]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(cont_career_dataset)
cont_career_dataset = pd.DataFrame(scaled_data,columns=cont_career_dataset.columns)

In [196]:
cont_career_dataset.head()

Unnamed: 0,Acedamic percentage in Operating Systems,percentage in Algorithms,Percentage in Programming Concepts,Percentage in Software Engineering,Percentage in Computer Networks,Percentage in Electronics Subjects,Percentage in Computer Architecture,Percentage in Mathematics,Percentage in Communication skills,Hours working per day,Logical quotient rating,hackathons,coding skills rating,public speaking points
0,-0.79345,-1.380807,0.096941,0.98195,1.700806,1.670279,0.986229,0.699022,-1.575875,0.390962,-0.38593,-1.492042,-0.389927,1.153889
1,0.098925,-1.479803,-0.396421,-1.694608,-0.59464,-0.689921,-0.404204,0.699022,1.393469,1.547595,0.777619,-0.993473,-1.166093,-0.7775
2,-0.595145,0.896086,1.37968,0.98195,-1.59266,0.391837,-0.50352,-0.484608,1.690403,1.162051,-1.549479,0.502233,-1.554176,-0.7775
3,-0.099381,0.995082,-1.67916,0.684554,1.201796,-0.394896,-1.496687,1.093566,-0.78405,-0.380128,-1.549479,-0.993473,-1.166093,-0.004944
4,1.487064,-1.479803,1.281008,-1.000686,-0.59464,1.17857,-0.404204,-0.583244,-0.388138,-1.536762,0.00192,0.502233,0.38624,-0.7775


### 2.b) Label encoding categorical features

In [197]:
label_encoder = LabelEncoder()

In [198]:
def encode_feature(feature):
  cat_career_dataset[feature] = lable_encoder.fit_transform(cat_career_dataset[feature])

In [199]:
for feature in cat_career_dataset.columns:
  encode_feature(feature)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [200]:
cat_career_dataset.head()

Unnamed: 0,can work long time before system?,self-learning capability?,Extra-courses did,certifications,workshops,talenttests taken?,olympiads,reading and writing skills,memory capability score,Interested subjects,...,interested in games,Interested Type of Books,Salary Range Expected,In a Realtionship?,Gentle or Tuff behaviour?,Management or Technical,Salary/work,hard/smart worker,worked in teams ever?,Introvert
0,1,1,1,8,0,0,1,0,0,4,...,0,21,1,0,1,0,0,0,1,0
1,1,0,1,5,2,0,0,2,1,7,...,1,5,1,1,0,1,0,0,0,1
2,1,0,1,0,7,0,1,2,0,6,...,1,29,0,0,1,0,1,0,0,1
3,0,1,0,6,1,1,0,1,0,7,...,0,23,0,1,0,0,1,1,1,1
4,0,0,0,0,0,0,0,2,0,0,...,1,7,1,0,1,0,1,0,1,1


### 2.c) Feature Selection

In [201]:
X = pd.concat([cont_career_dataset,cat_career_dataset],axis=1)
y = pd.DataFrame(career_dataset.iloc[:,-1],columns=['Suggested Job Role'])

In [202]:
X.head()

Unnamed: 0,Acedamic percentage in Operating Systems,percentage in Algorithms,Percentage in Programming Concepts,Percentage in Software Engineering,Percentage in Computer Networks,Percentage in Electronics Subjects,Percentage in Computer Architecture,Percentage in Mathematics,Percentage in Communication skills,Hours working per day,...,interested in games,Interested Type of Books,Salary Range Expected,In a Realtionship?,Gentle or Tuff behaviour?,Management or Technical,Salary/work,hard/smart worker,worked in teams ever?,Introvert
0,-0.79345,-1.380807,0.096941,0.98195,1.700806,1.670279,0.986229,0.699022,-1.575875,0.390962,...,0,21,1,0,1,0,0,0,1,0
1,0.098925,-1.479803,-0.396421,-1.694608,-0.59464,-0.689921,-0.404204,0.699022,1.393469,1.547595,...,1,5,1,1,0,1,0,0,0,1
2,-0.595145,0.896086,1.37968,0.98195,-1.59266,0.391837,-0.50352,-0.484608,1.690403,1.162051,...,1,29,0,0,1,0,1,0,0,1
3,-0.099381,0.995082,-1.67916,0.684554,1.201796,-0.394896,-1.496687,1.093566,-0.78405,-0.380128,...,0,23,0,1,0,0,1,1,1,1
4,1.487064,-1.479803,1.281008,-1.000686,-0.59464,1.17857,-0.404204,-0.583244,-0.388138,-1.536762,...,1,7,1,0,1,0,1,0,1,1


In [203]:
y.head()

Unnamed: 0,Suggested Job Role
0,Database Developer
1,Portal Administrator
2,Portal Administrator
3,Systems Security Administrator
4,Business Systems Analyst


In [204]:
y['Suggested Job Role'] = label_encoder.fit_transform(y['Suggested Job Role'])

In [205]:
y.head()

Unnamed: 0,Suggested Job Role
0,7
1,18
2,18
3,28
4,2


In [206]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

#### Seeing if there is any constant column

In [207]:
variance_threshold = VarianceThreshold(threshold=0)
vt_features = variance_threshold.fit(X_train)
vt_features.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

##### Getting the mutual information of the features

In [208]:
mutual_info = mutual_info_classif(X_train,y_train)
mutual_info


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



array([0.        , 0.        , 0.00181873, 0.00356719, 0.00299293,
       0.        , 0.        , 0.00728271, 0.        , 0.00770389,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.00250434, 0.00521485, 0.        , 0.00094237,
       0.00122938, 0.0042693 , 0.00513534, 0.00336163, 0.        ,
       0.00469881, 0.        , 0.02077643, 0.00140083, 0.00465269,
       0.00384151, 0.        , 0.00518319, 0.        , 0.        ,
       0.        , 0.        , 0.        ])

In [209]:
mutual_info = pd.Series(mutual_info)
mutual_info.index = X_train.columns
mutual_info

Acedamic percentage in Operating Systems    0.000000
percentage in Algorithms                    0.000000
Percentage in Programming Concepts          0.001819
Percentage in Software Engineering          0.003567
Percentage in Computer Networks             0.002993
Percentage in Electronics Subjects          0.000000
Percentage in Computer Architecture         0.000000
Percentage in Mathematics                   0.007283
Percentage in Communication skills          0.000000
Hours working per day                       0.007704
Logical quotient rating                     0.000000
hackathons                                  0.000000
coding skills rating                        0.000000
public speaking points                      0.000000
can work long time before system?           0.000000
self-learning capability?                   0.000000
Extra-courses did                           0.002504
certifications                              0.005215
workshops                                   0.

In [210]:
mutual_info.sort_values(ascending=False,inplace=True)
mutual_info

Taken inputs from seniors or elders         0.020776
Hours working per day                       0.007704
Percentage in Mathematics                   0.007283
certifications                              0.005215
Gentle or Tuff behaviour?                   0.005183
memory capability score                     0.005135
Job/Higher Studies?                         0.004699
Interested Type of Books                    0.004653
reading and writing skills                  0.004269
Salary Range Expected                       0.003842
Percentage in Software Engineering          0.003567
Interested subjects                         0.003362
Percentage in Computer Networks             0.002993
Extra-courses did                           0.002504
Percentage in Programming Concepts          0.001819
interested in games                         0.001401
olympiads                                   0.001229
talenttests taken?                          0.000942
Management or Technical                     0.

In [211]:
mutual_info_dataframe = pd.DataFrame(mutual_info, index=mutual_info.index, columns=['Mutual Information'])
mutual_info_dataframe

Unnamed: 0,Mutual Information
Taken inputs from seniors or elders,0.020776
Hours working per day,0.007704
Percentage in Mathematics,0.007283
certifications,0.005215
Gentle or Tuff behaviour?,0.005183
memory capability score,0.005135
Job/Higher Studies?,0.004699
Interested Type of Books,0.004653
reading and writing skills,0.004269
Salary Range Expected,0.003842


In [212]:
fig =px.bar(mutual_info_dataframe, x=mutual_info_dataframe.index,y='Mutual Information')
fig.show()

##### Here we see the features which have high infomation

In [213]:
important_features = mutual_info[0:23]

In [214]:
important_features

Taken inputs from seniors or elders    0.020776
Hours working per day                  0.007704
Percentage in Mathematics              0.007283
certifications                         0.005215
Gentle or Tuff behaviour?              0.005183
memory capability score                0.005135
Job/Higher Studies?                    0.004699
Interested Type of Books               0.004653
reading and writing skills             0.004269
Salary Range Expected                  0.003842
Percentage in Software Engineering     0.003567
Interested subjects                    0.003362
Percentage in Computer Networks        0.002993
Extra-courses did                      0.002504
Percentage in Programming Concepts     0.001819
interested in games                    0.001401
olympiads                              0.001229
talenttests taken?                     0.000942
Management or Technical                0.000000
Type of company want to settle in?     0.000000
Salary/work                            0

In [215]:
k_best_selector = SelectKBest(mutual_info_classif,k=22)
k_best_selector.fit(X_train,y_train['Suggested Job Role'])

SelectKBest(k=22,
            score_func=<function mutual_info_classif at 0x000001F760593280>)

In [216]:
k_best_selector.get_support()

array([False,  True, False,  True,  True, False,  True, False, False,
       False,  True, False, False, False,  True,  True,  True, False,
        True,  True, False,  True, False, False,  True, False, False,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True])

In [217]:
feature_selected_by_kBest = X.columns[k_best_selector.get_support()]
feature_selected_by_kBest

Index(['percentage in Algorithms', 'Percentage in Software Engineering',
       'Percentage in Computer Networks',
       'Percentage in Computer Architecture', 'Logical quotient rating',
       'can work long time before system?', 'self-learning capability?',
       'Extra-courses did', 'workshops', 'talenttests taken?',
       'reading and writing skills', 'interested career area ',
       'Taken inputs from seniors or elders', 'interested in games',
       'Interested Type of Books', 'Salary Range Expected',
       'In a Realtionship?', 'Management or Technical', 'Salary/work',
       'hard/smart worker', 'worked in teams ever?', 'Introvert'],
      dtype='object')

In [218]:
X_train = X_train[important_features.index]

In [219]:
X_train.head()

Unnamed: 0,Taken inputs from seniors or elders,Hours working per day,Percentage in Mathematics,certifications,Gentle or Tuff behaviour?,memory capability score,Job/Higher Studies?,Interested Type of Books,reading and writing skills,Salary Range Expected,...,Extra-courses did,Percentage in Programming Concepts,interested in games,olympiads,talenttests taken?,Management or Technical,Type of company want to settle in?,Salary/work,hard/smart worker,worked in teams ever?
3443,0,-0.765672,-1.372331,0,0,0,1,7,0,0,...,0,1.675697,0,0,0,1,9,0,0,1
1310,1,-1.536762,-1.470966,6,1,1,0,7,0,1,...,1,-1.67916,0,1,0,0,5,1,1,1
3732,1,1.162051,0.896294,7,0,0,0,12,1,0,...,0,-1.087126,0,0,0,1,5,0,1,0
9955,1,-0.765672,-1.569602,1,1,0,0,18,0,1,...,1,-1.67916,0,0,1,0,8,0,0,1
18407,1,-1.151217,-0.090064,3,0,0,0,22,2,1,...,0,0.590302,1,0,1,1,6,0,1,1


In [220]:
X_test = X_test[important_features.index]

In [221]:
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train,y_train)

DecisionTreeClassifier()

In [222]:
dt_prediction_train = dt_model.predict(X_train)
dt_prediction_test = dt_model.predict(X_test)

In [223]:
print("On Training Data: ",accuracy_score(y_train,dt_prediction_train))
print("On Test Data: ",accuracy_score(y_test,dt_prediction_test))

On Training Data:  1.0
On Test Data:  0.02893939393939394


In [224]:
# feature_selector = SelectKBest(score_func=chi2,k=11)
# feature_selector.fit(X_train,y_train)
# X_train_fs= feature_selector.transform(X_train)
# X_test_fs= feature_selector.transform(X_test)

In [225]:
# len(feature_selector.scores_)

In [226]:
# career_dataset.shape[1]

In [227]:
import xgboost as xgb

In [228]:
xgb_model = xgb.XGBClassifier()

In [229]:
 
xgb_model.fit(X_train, y_train)
xgb_y_pred  = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test,xgb_y_pred)
print("accuracy=",xgb_accuracy*100)




A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



accuracy= 3.3030303030303028


In [230]:
from sklearn.ensemble import RandomForestClassifier
rf_model  = RandomForestClassifier()

In [231]:
rf_model.fit(X_train, y_train)
pred = rf_model.predict(X_train)
pred1 = rf_model.predict(X_test)
print("Train ",accuracy_score(y_train,pred))
print("Test ",accuracy_score(y_test,pred1))





A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



Train  1.0
Test  0.03378787878787879
