In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib_venn import venn2, venn3, venn2_circles, venn3_circles #to create intersection graphs
import matplotlib.pyplot as plt #to plot show the charts
import seaborn as sns
from scipy import stats

from nltk import word_tokenize
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

import os 
# Walk the Kaggle input directory recursively and print the full path of
# every file found, so the available data files are visible in the output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the bare file name.
        print(os.path.join(dirname, filename))

/kaggle/input/google-quest-challenge/train.csv
/kaggle/input/google-quest-challenge/test.csv
/kaggle/input/google-quest-challenge/sample_submission.csv


In [2]:
# Directory that holds the competition CSV files. Kept in one place so the
# notebook can be pointed at another location by editing a single line.
DATA_DIR = "../input/google-quest-challenge"

# Load the three competition files into dataframes.
df_train = pd.read_csv(f"{DATA_DIR}/train.csv")
df_test = pd.read_csv(f"{DATA_DIR}/test.csv")
df_sub = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [3]:
# Define a helper that summarizes the basic properties of a dataframe
def summarizetable(df):
    """Print the shape of ``df`` and return a per-column summary table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe. Any index is accepted, because sample rows are
        taken by position rather than by label.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df``, with these columns:

        * ``Name`` - the column label.
        * ``dtypes`` - the column dtype.
        * ``Missing`` - number of null entries.
        * ``Uniques`` - number of distinct non-null values.
        * ``First Value`` / ``Second Value`` / ``Third Value`` - the values
          of the first three rows (NaN where ``df`` has fewer rows).
        * ``Entropy`` - base-2 entropy of the column's normalized value
          counts, rounded to two decimals.
    """
    print(f"Dataset Shape: {df.shape}")

    # Build the frame directly from the column labels and dtypes instead of
    # relying on reset_index() producing a column literally called 'index',
    # which is not the case when df.columns carries a name.
    summary = pd.DataFrame({
        'Name': list(df.columns),
        'dtypes': df.dtypes.values,
    })
    summary['Missing'] = df.isnull().sum().values
    summary['Uniques'] = df.nunique().values

    # Use positional access: df.loc[0] looks up the *label* 0 and raises
    # KeyError on filtered, re-sorted or non-integer indexes. Frames with
    # fewer than three rows get NaN for the rows that do not exist.
    sample_labels = ('First Value', 'Second Value', 'Third Value')
    for position, label in enumerate(sample_labels):
        if position < len(df):
            summary[label] = df.iloc[position].values
        else:
            summary[label] = np.nan

    # One entropy value per column, computed in column order so it lines up
    # with the rows of `summary`; columns are selected by position so that
    # duplicated column labels are handled too.
    summary['Entropy'] = [
        round(
            float(stats.entropy(df.iloc[:, col_pos].value_counts(normalize=True), base=2)),
            2,
        )
        for col_pos in range(df.shape[1])
    ]

    return summary

In [4]:
# Show the summary rows for the first ten columns of the training data
summarizetable(df_train).head(10)

Dataset Shape: (6079, 41)


Unnamed: 0,Name,dtypes,Missing,Uniques,First Value,Second Value,Third Value,Entropy
0,qa_id,int64,0,6079,0,1,2,12.57
1,question_title,object,0,3583,What am I losing when using extension tubes in...,What is the distinction between a city and a s...,Maximum protusion length for through-hole comp...,11.64
2,question_body,object,0,3583,After playing around with macro photography on...,I am trying to understand what kinds of places...,I'm working on a PCB that has through-hole com...,11.64
3,question_user_name,object,0,3215,ysap,russellpierce,Joe Baker,11.42
4,question_user_page,object,0,3422,https://photo.stackexchange.com/users/1024,https://rpg.stackexchange.com/users/8774,https://electronics.stackexchange.com/users/10157,11.56
5,answer,object,0,6079,"I just got extension tubes, so here's the skin...",It might be helpful to look into the definitio...,Do you even need grooves? We make several pro...,12.57
6,answer_user_name,object,0,4114,rfusca,Erik Schmidt,Dwayne Reid,11.68
7,answer_user_page,object,0,4430,https://photo.stackexchange.com/users/1917,https://rpg.stackexchange.com/users/1871,https://electronics.stackexchange.com/users/64754,11.85
8,url,object,0,3583,http://photo.stackexchange.com/questions/9169/...,http://rpg.stackexchange.com/questions/47820/w...,http://electronics.stackexchange.com/questions...,11.64
9,category,object,0,5,LIFE_ARTS,CULTURE,SCIENCE,2.14


In [5]:
# Show the summary rows for the first ten columns of the test data
summarizetable(df_test).head(10)

Dataset Shape: (476, 11)


Unnamed: 0,Name,dtypes,Missing,Uniques,First Value,Second Value,Third Value,Entropy
0,qa_id,int64,0,476,39,46,70,8.89
1,question_title,object,0,476,Will leaving corpses lying around upset my pri...,Url link to feature image in the portfolio,"Is accuracy, recoil or bullet spread affected ...",8.89
2,question_body,object,0,476,I see questions/information online about how t...,I am new to Wordpress. i have issue with Featu...,"To experiment I started a bot game, toggled in...",8.89
3,question_user_name,object,0,467,Dylan,Anu,Konsta,8.85
4,question_user_page,object,0,474,https://gaming.stackexchange.com/users/64471,https://wordpress.stackexchange.com/users/72927,https://gaming.stackexchange.com/users/37545,8.89
5,answer,object,0,476,There is no consequence for leaving corpses an...,I think it is possible with custom fields.\n\n...,You do not have armour in the screenshots. Thi...,8.89
6,answer_user_name,object,0,363,Nelson868,Irina,Damon Smithies,8.38
7,answer_user_page,object,0,367,https://gaming.stackexchange.com/users/97324,https://wordpress.stackexchange.com/users/27233,https://gaming.stackexchange.com/users/70641,8.41
8,url,object,0,476,http://gaming.stackexchange.com/questions/1979...,http://wordpress.stackexchange.com/questions/1...,http://gaming.stackexchange.com/questions/2154...,8.89
9,category,object,0,5,CULTURE,TECHNOLOGY,CULTURE,2.09


In [6]:
# Preview the first rows of the training dataframe.
df_train.head()

Unnamed: 0,qa_id,question_title,question_body,question_user_name,question_user_page,answer,answer_user_name,answer_user_page,url,category,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0,What am I losing when using extension tubes in...,After playing around with macro photography on...,ysap,https://photo.stackexchange.com/users/1024,"I just got extension tubes, so here's the skin...",rfusca,https://photo.stackexchange.com/users/1917,http://photo.stackexchange.com/questions/9169/...,LIFE_ARTS,...,1.0,1.0,0.666667,1.0,1.0,0.8,1.0,0.0,0.0,1.0
1,1,What is the distinction between a city and a s...,I am trying to understand what kinds of places...,russellpierce,https://rpg.stackexchange.com/users/8774,It might be helpful to look into the definitio...,Erik Schmidt,https://rpg.stackexchange.com/users/1871,http://rpg.stackexchange.com/questions/47820/w...,CULTURE,...,0.888889,0.888889,0.555556,0.888889,0.888889,0.666667,0.0,0.0,0.666667,0.888889
2,2,Maximum protusion length for through-hole comp...,I'm working on a PCB that has through-hole com...,Joe Baker,https://electronics.stackexchange.com/users/10157,Do you even need grooves? We make several pro...,Dwayne Reid,https://electronics.stackexchange.com/users/64754,http://electronics.stackexchange.com/questions...,SCIENCE,...,0.777778,0.777778,0.555556,1.0,1.0,0.666667,0.0,0.333333,1.0,0.888889
3,3,Can an affidavit be used in Beit Din?,"An affidavit, from what i understand, is basic...",Scimonster,https://judaism.stackexchange.com/users/5151,"Sending an ""affidavit"" it is a dispute between...",Y e z,https://judaism.stackexchange.com/users/4794,http://judaism.stackexchange.com/questions/551...,CULTURE,...,0.888889,0.833333,0.333333,0.833333,1.0,0.8,0.0,0.0,1.0,1.0
4,5,How do you make a binary image in Photoshop?,I am trying to make a binary image. I want mor...,leigero,https://graphicdesign.stackexchange.com/users/...,Check out Image Trace in Adobe Illustrator. \n...,q2ra,https://graphicdesign.stackexchange.com/users/...,http://graphicdesign.stackexchange.com/questio...,LIFE_ARTS,...,1.0,1.0,0.666667,1.0,1.0,0.8,1.0,0.0,1.0,1.0


In [7]:
# Preview the first rows of the test dataframe.
df_test.head()

Unnamed: 0,qa_id,question_title,question_body,question_user_name,question_user_page,answer,answer_user_name,answer_user_page,url,category,host
0,39,Will leaving corpses lying around upset my pri...,I see questions/information online about how t...,Dylan,https://gaming.stackexchange.com/users/64471,There is no consequence for leaving corpses an...,Nelson868,https://gaming.stackexchange.com/users/97324,http://gaming.stackexchange.com/questions/1979...,CULTURE,gaming.stackexchange.com
1,46,Url link to feature image in the portfolio,I am new to Wordpress. i have issue with Featu...,Anu,https://wordpress.stackexchange.com/users/72927,I think it is possible with custom fields.\n\n...,Irina,https://wordpress.stackexchange.com/users/27233,http://wordpress.stackexchange.com/questions/1...,TECHNOLOGY,wordpress.stackexchange.com
2,70,"Is accuracy, recoil or bullet spread affected ...","To experiment I started a bot game, toggled in...",Konsta,https://gaming.stackexchange.com/users/37545,You do not have armour in the screenshots. Thi...,Damon Smithies,https://gaming.stackexchange.com/users/70641,http://gaming.stackexchange.com/questions/2154...,CULTURE,gaming.stackexchange.com
3,132,Suddenly got an I/O error from my external HDD,I have used my Raspberry Pi as a torrent-serve...,robbannn,https://raspberrypi.stackexchange.com/users/17341,Your Western Digital hard drive is disappearin...,HeatfanJohn,https://raspberrypi.stackexchange.com/users/1311,http://raspberrypi.stackexchange.com/questions...,TECHNOLOGY,raspberrypi.stackexchange.com
4,200,Passenger Name - Flight Booking Passenger only...,I have bought Delhi-London return flights for ...,Amit,https://travel.stackexchange.com/users/29089,I called two persons who work for Saudia (tick...,Nean Der Thal,https://travel.stackexchange.com/users/10051,http://travel.stackexchange.com/questions/4704...,CULTURE,travel.stackexchange.com


In [None]:
# Preview the first rows of the sample submission dataframe.
df_sub.head()