In [7]:
from pathlib import Path
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.ensemble import IsolationForest
import plotly.express as px
from sklearn.decomposition import PCA

In [8]:
# Workshop dataset, fetched over HTTP from GitHub each time this cell runs.
DATA_URL = 'https://raw.githubusercontent.com/Edric-Matwiejew/notebook_share_test/main/Dataanalysis_workshop.csv'
df = pd.read_csv(DATA_URL)

In [9]:
# Display the frame exactly as loaded from the CSV.
df

Unnamed: 0,State,Year of Birth (YYYY),Country of Birth,highest academic degree,Identity as aboriginal/torres strait islander,Citizen/PR,First Language,gender
0,VIC,1997,Germany,Master Degree,No,No,german,woman
1,WA,1972,USA,Master Degree,No,Yes,english,woman
2,NSW,1995,Australia,Bachelor Degree,No,Yes,english,woman
3,WA,1999,Israel,High School Diploma,No,Yes,hebrew,man
4,WA,2000,Australia,High School Diploma,No,Yes,english,man
5,WA,1990,Australia,Master Degree,No,Yes,english,man
6,WA,2001,Germany,High School Diploma,No,Yes,german,man
7,WA,1998,Australia,Bachelor Honours Degree,No,Yes,english,man
8,WA,1992,Australia,High School Diploma,No,Yes,english,man
9,WA,1994,Australia,Bachelor Degree,No,Yes,english,man


In [10]:
# First encoding attempt: ordinal-encode every column of `df` in one go.
# The result is a plain array (no column labels). The next cell rebinds both
# `le` and `df_transformed`, so nothing downstream uses this output.
le = OrdinalEncoder()
le.fit(df)
df_transformed = le.transform(df)

In [11]:
# Turn the text (object-dtype) columns into integer codes so the whole frame is
# numeric; numeric columns such as the birth year are left untouched.
#
# OrdinalEncoder is scikit-learn's encoder for feature columns (LabelEncoder is
# documented as an encoder for target labels). Fitting it once over all object
# columns keeps every column's category list in `le.categories_`, whereas
# refitting one LabelEncoder per column only retained the last column's mapping.
# Codes are assigned in sorted category order, matching the previous output.
#
# NOTE: dtype=int assumes the object columns contain no missing values (none
# are visible in the displayed data); the codes carry no real ordering.
le = OrdinalEncoder(dtype=int)
df_transformed = df.copy()
categorical_cols = df_transformed.select_dtypes(include='O').columns
if len(categorical_cols) > 0:  # the encoder cannot be fitted on zero columns
    df_transformed[categorical_cols] = le.fit_transform(df_transformed[categorical_cols])

In [12]:
# Display the encoded copy: text columns are now integer codes.
df_transformed

Unnamed: 0,State,Year of Birth (YYYY),Country of Birth,highest academic degree,Identity as aboriginal/torres strait islander,Citizen/PR,First Language,gender
0,1,1997,1,3,0,0,1,1
1,2,1972,4,3,0,1,0,1
2,0,1995,0,0,0,1,0,1
3,2,1999,3,2,0,1,2,0
4,2,2000,0,2,0,1,0,0
5,2,1990,0,3,0,1,0,0
6,2,2001,1,2,0,1,1,0
7,2,1998,0,1,0,1,0,0
8,2,1992,0,2,0,1,0,0
9,2,1994,0,0,0,1,0,0


In [13]:
# Fit an Isolation Forest on the encoded frame and label every row in one pass.
# random_state is fixed so repeated runs give the same labels.
# Despite its name, `clf` holds the per-row labels rather than the estimator;
# scikit-learn's convention is -1 for outliers and 1 for inliers.
isolation_forest = IsolationForest(random_state=0)
clf = isolation_forest.fit_predict(df_transformed)

In [14]:
# Attach the Isolation Forest labels to the original (un-encoded) frame so they
# can be read next to the human-readable values. This mutates `df` in place.
df['anomaly'] = clf

In [15]:
# Display the frame with the new 'anomaly' column.
df

Unnamed: 0,State,Year of Birth (YYYY),Country of Birth,highest academic degree,Identity as aboriginal/torres strait islander,Citizen/PR,First Language,gender,anomaly
0,VIC,1997,Germany,Master Degree,No,No,german,woman,-1
1,WA,1972,USA,Master Degree,No,Yes,english,woman,-1
2,NSW,1995,Australia,Bachelor Degree,No,Yes,english,woman,-1
3,WA,1999,Israel,High School Diploma,No,Yes,hebrew,man,-1
4,WA,2000,Australia,High School Diploma,No,Yes,english,man,1
5,WA,1990,Australia,Master Degree,No,Yes,english,man,1
6,WA,2001,Germany,High School Diploma,No,Yes,german,man,1
7,WA,1998,Australia,Bachelor Honours Degree,No,Yes,english,man,1
8,WA,1992,Australia,High School Diploma,No,Yes,english,man,1
9,WA,1994,Australia,Bachelor Degree,No,Yes,english,man,1


In [16]:
# Project the encoded frame onto its first two principal components and store
# the coordinates on `df` for plotting.
# NOTE(review): the features are not scaled before PCA. The saved output shows
# the first component explaining ~95% of the variance, which is likely driven
# by the much larger spread of 'Year of Birth (YYYY)' -- consider standardising.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(df_transformed)
# Each row of the transposed result holds every sample's coordinate on one component.
df['pca-one'], df['pca-two'] = pca_result.T
explained = pca.explained_variance_ratio_
print('Explained variation per principal component: {}'.format(explained))

Explained variation per principal component: [0.94619359 0.02710328]


In [17]:
# Display the frame with the 'pca-one' and 'pca-two' coordinates added.
df

Unnamed: 0,State,Year of Birth (YYYY),Country of Birth,highest academic degree,Identity as aboriginal/torres strait islander,Citizen/PR,First Language,gender,anomaly,pca-one,pca-two
0,VIC,1997,Germany,Master Degree,No,No,german,woman,-1,-2.636756,1.043027
1,WA,1972,USA,Master Degree,No,Yes,english,woman,-1,22.503981,0.995508
2,NSW,1995,Australia,Bachelor Degree,No,Yes,english,woman,-1,-0.883656,-1.823954
3,WA,1999,Israel,High School Diploma,No,Yes,hebrew,man,-1,-4.556084,2.807837
4,WA,2000,Australia,High School Diploma,No,Yes,english,man,1,-5.752171,-0.262492
5,WA,1990,Australia,Master Degree,No,Yes,english,man,1,4.244009,-0.655778
6,WA,2001,Germany,High School Diploma,No,Yes,german,man,1,-6.687722,1.004995
7,WA,1998,Australia,Bachelor Honours Degree,No,Yes,english,man,1,-3.822396,-0.871759
8,WA,1992,Australia,High School Diploma,No,Yes,english,man,1,2.198466,-0.930861
9,WA,1994,Australia,Bachelor Degree,No,Yes,english,man,1,0.095038,-1.64812


In [18]:
# Scatter the rows in PCA space, coloured by their Isolation Forest label.
# The label is cast to str so plotly treats it as a category (discrete colours)
# rather than a continuous numeric scale. This mutates `df` in place.
df["anomaly"] = df["anomaly"].astype(str)

# Pin each label to its colour explicitly. A bare colour sequence is handed out
# in order of first appearance, so the red/blue meaning would flip whenever the
# first row of the data happened to be an inlier.
ANOMALY_COLOURS = {"-1": "indianred", "1": "mediumblue"}

fig = px.scatter(
    df,
    x="pca-one",
    y="pca-two",
    color="anomaly",
    hover_data=[
        'State',
        'Country of Birth',
        'highest academic degree',
        'Identity as aboriginal/torres strait islander',
        'Citizen/PR',
    ],
    color_discrete_map=ANOMALY_COLOURS,
    title="Isolation Forest labels in PCA space",
)

fig.show()