In [0]:
%run ../plots

In [0]:
%run ../preprocess

In [0]:
# Base name (without extension) of the parquet dataset; it is interpolated into the
# S3 path when the survey data is loaded.
BASE_FILE_NAME = 'supervised_survey'
# Switch for the one-shot run: when True, the full preprocess_pipeline is invoked on
# the loaded DataFrame instead of only walking through the individual steps.
TO_SAVE = False

In [0]:
# Load the survey dataset from the processed S3 location and keep it cached,
# since every preprocessing step below starts from this DataFrame.
df_with_target = (
    spark.read
    .parquet(f"s3a://{S3_PROCESS_PATH}{BASE_FILE_NAME}.parquet")
    .cache()
)

Out[365]: {'EdLevel': {'Primary/elementary school': 1.0,
  'Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)': 2.0,
  'Associate degree (A.A., A.S., etc.)': 3.0,
  'Some college/university study without earning a degree': 4.0,
  'Something else, Professional degree (JD, MD, etc.)': 5.0,
  'Bachelor’s degree (B.A., B.S., B.Eng., etc.)': 6.0,
  'Master’s degree (M.A., M.S., M.Eng., MBA, etc.)': 7.0,
  'Other doctoral degree (Ph.D., Ed.D., etc.)': 8.0},
 'Age1stCode': {'Younger than 5 years': 1.0,
  '5 - 10 years': 2.0,
  '11 - 17 years': 3.0,
  '18 - 24 years': 4.0,
  '25 - 34 years': 5.0,
  '35 - 44 years': 6.0,
  '45 - 54 years': 7.0,
  '55 - 64 years': 8.0,
  'Older than 64 years': 9.0},
 'OrgSize': {'Just me - I am a freelancer, sole proprietor, etc.': 1.0,
  '2 to 9 employees': 2.0,
  '10 to 19 employees': 3.0,
  '20 to 99 employees': 4.0,
  '100 to 499 employees': 5.0,
  'I don’t know': 6.0,
  '500 to 999 employees': 7.0,
  '1,000 to 4,999 employee

In [0]:
# Run the full preprocess_pipeline in a single call (only when TO_SAVE is enabled).
# NOTE(review): the return value is not captured here, so any saving has to happen
# inside preprocess_pipeline itself — confirm in ../preprocess.
if TO_SAVE:
    preprocess_pipeline(df=df_with_target)

Split the following columns: 
 ['DevType', 'LanguageHaveWorkedWith', 'LanguageWantToWorkWith', 'DatabaseHaveWorkedWith', 'DatabaseWantToWorkWith', 'PlatformHaveWorkedWith', 'PlatformWantToWorkWith', 'WebframeHaveWorkedWith', 'WebframeWantToWorkWith', 'MiscTechHaveWorkedWith', 'MiscTechWantToWorkWith', 'ToolsTechHaveWorkedWith', 'ToolsTechWantToWorkWith', 'NEWCollabToolsHaveWorkedWith', 'NEWCollabToolsWantToWorkWith', 'NEWStuck']


In [0]:
# Run preprocess pipeline step by step

In [0]:
# Step 1: pass the loaded DataFrame through group_low_freq_cols.
# Presumably merges rarely-occurring categories into a shared bucket — TODO confirm in ../preprocess.
step_1 = group_low_freq_cols(df=df_with_target)

##### Running the one-way ANOVA test (analysis of variance) between 'ConvertedCompYearly' and each of 'Country', 'Currency', 'Ethnicity'
##### Null hypothesis (H0): 'Country' / 'Currency' / 'Ethnicity' are NOT related to 'ConvertedCompYearly', i.e. the mean 'ConvertedCompYearly' is the same across the categories of each variable.

#### Normality Assumption Check
##### Before we perform the hypothesis test, we check if the assumptions for the one-way ANOVA hypothesis test are fulfilled. We check the normality assumption by plotting a normal probability plot (Q-Q plots) for each grouped variable.

In [0]:
# Draw the Q-Q plots on the step-1 data to visually check the ANOVA normality assumption.
plot_multiple_qq_plots(df=step_1)

#### What to do if this assumption is violated:
##### In general, a one-way ANOVA is considered to be fairly robust against violations of the normality assumption as long as the sample sizes are sufficiently large.
##### In our case, the default frequency threshold for a category is 1% of the sample, so each category has more than roughly 450 observations.
##### Also, we will replace the outliers with low and high boundaries.

In [0]:
# Step 2: apply floor_and_cap_outliers to the step-1 output.
# Presumably clips extreme values to lower/upper bounds — TODO confirm the bounds in ../preprocess.
step_2 = floor_and_cap_outliers(df=step_1)

In [0]:
# Re-draw the Q-Q plots on the outlier-treated data so they can be compared with the step-1 plots.
plot_multiple_qq_plots(df=step_2)

##### Looks better after replacing the outliers with the low and high boundaries.

In [0]:
# Run the one-way ANOVA tests on the outlier-treated data (step_2).
run_multiple_anova_tests(df=step_2)

In [0]:
# Step 3: apply convert_years_cols to the step-2 output.
# Presumably converts the years-of-coding columns to numeric values — TODO confirm in ../preprocess.
step_3 = convert_years_cols(df=step_2)

In [0]:
# Collect the names of the columns whose dtype string is exactly "int" or "double".
num_cols = [
    col_name
    for col_name, col_type in step_3.dtypes
    if col_type in ("int", "double")
]
print(f"Numeric columns: \n {num_cols}")

##### Check correlation between 'YearsCode' variable and 'YearsCodePro' variable

In [0]:
# Build a Plot helper over the step-3 data for the YearsCode / YearsCodePro correlation check.
# NOTE(review): the exact role of col_name is defined by the Plot class in ../plots — confirm there.
years_corr = Plot(
    df=step_3,
    col_name="YearsCodePro",
    title="Correlation between the YearsCode variable and YearsCodePro variable",
)

In [0]:
# Render the correlation plot, restricted to the two years-of-coding columns.
years_corr.plot_correlation(cols=['YearsCode', 'YearsCodePro'])

##### We can see that the 2 variables are highly correlated. We may want to drop one of them.
##### Let's check which variable is more correlated with the target.

In [0]:
# Plot helper over just the two years-of-coding columns plus the target column,
# used to compare how each of them correlates with ConvertedCompYearly.
corr_plot = Plot(
    df=step_3.select("YearsCode", "YearsCodePro", "ConvertedCompYearly"),
    col_name="ConvertedCompYearly",
    title="Pearson correlation with the target",
)

In [0]:
# Render the correlation plot for corr_plot; no explicit cols argument is passed here.
corr_plot.plot_correlation()

##### Surprisingly, neither variable is highly correlated with the target.

In [0]:
# Step 4: apply set_bins_for_years_cols to the step-3 output.
# Presumably buckets the years-of-coding columns into bins — TODO confirm bin edges in ../preprocess.
step_4 = set_bins_for_years_cols(df=step_3)

In [0]:
# Print the schema of the step-4 DataFrame to inspect its columns and types.
step_4.printSchema()

In [0]:
# Step 5: apply replace_to_null to the step-4 output.
# NOTE(review): which values get replaced by null is decided inside ../preprocess — confirm there.
step_5 = replace_to_null(df=step_4)

In [0]:
# Step 6: apply map_ordinal_cols to the step-5 output, using the ORDINAL_MAPPING dictionary.
# Presumably ORDINAL_MAPPING maps each ordinal column's labels to numeric ranks — TODO confirm in ../preprocess.
step_6 = map_ordinal_cols(df=step_5, mapping_dict=ORDINAL_MAPPING)

In [0]:
# Show the step-6 DataFrame to inspect the result of the ordinal mapping.
display(step_6)

<span style="color:blue">**The function below splits a categorical column into many columns, one per category, each holding the value 1 or 0.  So why not just use the built-in OneHotEncoder from the sklearn.preprocessing module?  Because the categorical columns in this dataset include thousands of combinations of categorical values, with A LOT of low-represented classes.
Using OneHotEncoder directly would create a lot of variables.** </span>

In [0]:
# Demonstrate split_a_column on a single column (DevType).
# The result is kept under its own name rather than step_7: the pipeline's step_7 is
# produced by split_multiple_columns on step_6, so binding this single-column preview
# to step_7 would only be overwritten, and could leave step_7 holding a partial split
# if the cells are run out of order.
dev_type_split_preview = split_a_column(df=step_6, col_name='DevType')

In [0]:
# List the columns that are configured (in COLS_TO_SPLIT) to be split.
print(f'Split the following columns: \n {COLS_TO_SPLIT}')

In [0]:
# Step 7: split every column listed in COLS_TO_SPLIT, starting from the step-6 output.
step_7 = split_multiple_columns(df=step_6, cols=COLS_TO_SPLIT)

In [0]:
# Columns dropped to avoid data leakage (per the original note: CompTotal, CompFreq).
# The actual list comes from COLS_TO_DROP and is printed here for reference.
print("Drop columns: ", COLS_TO_DROP)

In [0]:
# Final frame: remove the leakage columns as well as the original (pre-split) columns.
pre_processed_df = step_7.drop(*COLS_TO_DROP, *COLS_TO_SPLIT)

In [0]:
# Show the final pre-processed DataFrame.
display(pre_processed_df)

In [0]:
# Print the schema of the final pre-processed DataFrame to verify the resulting columns and types.
pre_processed_df.printSchema()