# **Loading Dataset from Kaggle to Dataframe**

### **A. Load Python Questions from Stack Overflow Kaggle Datasets**

In [1]:
! pip install kaggle

Collecting kaggle
  Downloading kaggle-1.5.13.tar.gz (63 kB)
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py): started
  Building wheel for kaggle (setup.py): finished with status 'done'
  Created wheel for kaggle: filename=kaggle-1.5.13-py3-none-any.whl size=77733 sha256=ee855734f23e73f89d41fcc00ff4cb874a063dc0f373d93ae422c134fcc44e43
  Stored in directory: c:\users\stuti\appdata\local\pip\cache\wheels\9c\45\15\6d6d116cd2539fb8f450d64b0aee4a480e5366bb11b42ac763
Successfully built kaggle
Installing collected packages: kaggle
Successfully installed kaggle-1.5.13


In [9]:
mkdir ~/.kaggle

The syntax of the command is incorrect.


In [3]:
! cp kaggle.json ~/.kaggle/

'cp' is not recognized as an internal or external command,
operable program or batch file.


In [4]:
! chmod 600 ~/.kaggle/kaggle.json

'chmod' is not recognized as an internal or external command,
operable program or batch file.


In [5]:
! kaggle datasets download stackoverflow/pythonquestions

Traceback (most recent call last):
  File "C:\Users\Stuti\anaconda3\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\Stuti\anaconda3\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\Users\Stuti\anaconda3\Scripts\kaggle.exe\__main__.py", line 4, in <module>
  File "C:\Users\Stuti\anaconda3\lib\site-packages\kaggle\__init__.py", line 23, in <module>
    api.authenticate()
  File "C:\Users\Stuti\anaconda3\lib\site-packages\kaggle\api\kaggle_api_extended.py", line 164, in authenticate
    raise IOError('Could not find {}. Make sure it\'s located in'
OSError: Could not find kaggle.json. Make sure it's located in C:\Users\Stuti\.kaggle. Or use the environment method.


In [6]:
! unzip pythonquestions.zip

'unzip' is not recognized as an internal or external command,
operable program or batch file.


### **B. Importing Libraries**

In [7]:
import numpy as np
import pandas as pd
import string
import re
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import matplotlib.pyplot as plt

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Stuti\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Stuti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### **C. Reading and Displaying Datasets**

In [8]:
questions = pd.read_csv('Questions.csv', encoding='latin')
answers = pd.read_csv('Answers.csv', encoding='latin')
tags = pd.read_csv('Tags.csv', encoding='latin')

FileNotFoundError: [Errno 2] No such file or directory: 'Questions.csv'

In [None]:
questions.head()

In [None]:
answers.head()

In [None]:
tags.head()

# **Data Preprocessing**

### **A. Merging The Datasets**

#### **1. 'Questions' Dataset**

In [None]:
# Rename column names
questions.columns=['Id', 'OwnerUserId',	'CreationDate',	'Score',	'Title', 'Question']

In [None]:
questions.head()

#### **2. 'Answers' Dataset**

In [None]:
# Dropping unecessary columns
answers.drop(columns=['Id', 'OwnerUserId', 'CreationDate'], inplace=True)

# Rename column names
answers.columns=['Id', 'Score', 'Answer']

In [None]:
# Joining answers grouped by 'Id'
grouped_answers = answers.groupby('Id')['Answer'].apply(lambda answer: ' '.join(answer))
grouped_answers = grouped_answers.to_frame().reset_index()

In [None]:
grouped_answers.head()

#### **3. 'Tags' Dataset**

In [None]:
# Changing the data type of 'Tag' column to string
tags['Tag']= tags['Tag'].astype(str)

# Joining tags grouped by 'Id'
grouped_tags = tags.groupby('Id')['Tag'].apply(lambda tag: ' '.join(tag))
grouped_tags = grouped_tags.to_frame().reset_index()

In [None]:
grouped_tags.head()

#### **4. Merging to a Single Dataset**

In [None]:
# Merging 'Questions' dataframe with 'Answers' dataframe, then with 'Tags' dataframe 
df = questions.merge(grouped_answers, how='left', on='Id')
df = df.merge(grouped_tags, how='left', on='Id')

In [None]:
df.head()

In [None]:
df.drop(columns=['Id', 'OwnerUserId', 'CreationDate'], inplace=True)

In [None]:
df.head()

### **B. Filtering Dataset Based on 'Score' Column and Most Frequently Used Tags**

In [None]:
# Renaming column names
df.columns = ['score', 'title', 'question', 'answer', 'tag']

In [None]:
df.head()

In [None]:
# Creating 'tag_count' column
temp_df = df.groupby('tag')['tag'].count()
temp_df = temp_df.to_frame()
temp_df.columns = ['tag_count']
temp_df = temp_df.reset_index()

In [None]:
temp_df.sort_values('tag_count', ascending=False).head()

In [None]:
# Merging created column to the existing dataframe
df= pd.merge(df, temp_df, how='left', on='tag')

In [None]:
df.head()

In [None]:
df.head(10)

In [None]:
df = df[(df['tag_count'] >= 1000) & (df['score'] > 3)]

In [None]:
df.shape

In [None]:
df.sort_values('tag_count', ascending=False)

### **C. Cleaning The Data**

In [None]:
df.isnull().sum()

In [None]:
# Dropping 'answer' column since it can't be imputed (because this column values is neither categorical nor continuous in nature)
df.drop(columns=['answer'], inplace=True)

In [None]:
# Defining a function to remove punctuation
def punctuation_remover(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

In [None]:
string.punctuation

In [None]:
# Changing the data type of 'title' column to string
df['title'] = df['title'].astype(str)

# Applying 'punctuation_remover' function on 'title' column
df['title'] = df['title'].apply(punctuation_remover)

# Changing text into lowercase
df['title'] = df['title'].str.lower()

# Splitting the text into words
df['title'] = df['title'].str.split()

In [None]:
# Changing the data type of 'question' column to string
df['question'] = df['question'].astype(str)

# Removing HTML tags on 'question' column values
df['question'] = df['question'].apply(lambda question: re.sub('<[^<]+?>', '', question))

# Applying 'punctuation_remover' function on 'question' column
df['question'] = df['question'].apply(punctuation_remover)

# Changing text into lowercase
df['question'] = df['question'].str.lower()

# Splitting the text into words
df['question'] = df['question'].str.split()

In [None]:
df['title'].head()

In [None]:
df['question'].head()

### **D. Lemmatization**

In [None]:
lematizer = WordNetLemmatizer()

# Defining lemmatizer function
def word_lemmatizer(text):
    lemma_text = [lematizer.lemmatize(word) for word in text]
    return lemma_text

In [None]:
# Applying lemmatizer function to 'title' and 'answer' columns
df['title'] = df['title'].apply(lambda title: word_lemmatizer(title))
df['question'] = df['question'].apply(lambda question: word_lemmatizer(question))

### **E. Removing Stopword**

In [None]:
df['title'] = df['title'].apply(lambda title: [word for word in title if word not in stopwords.words('english')])
df['question'] = df['question'].apply(lambda question: [word for word in question if word not in stopwords.words('english')])

In [None]:
stopwords.words('english')

In [None]:
df['title'].head()

In [None]:
df['question'].head()

In [None]:
df.drop(columns=['score', 'tag_count'], inplace=True)

In [None]:
df.title[12]

### **F. TF-IDF Vectorization**

In [None]:
'''
TF-IDF equation:
TF = (# of times of specific word in a doc) / (# of words in doc)
IDF = log((# of docs) / (# of docs that contains specific word))
TF-IDF = TF * IDF
'''

vectorizer = TfidfVectorizer()

# Changing the data type of 'title' and 'question' columns to string
df['title'] = df['title'].astype(str)
df['question'] = df['question'].astype(str)

X1 = vectorizer.fit_transform(df['title'].str.lower())
X2 = vectorizer.fit_transform(df['question'].str.lower())

In [None]:
print(X2)

In [None]:
label_encoder = LabelEncoder() 

df['tag'] = label_encoder.fit_transform(df['tag'])
y = df['tag'].values

# **Splitting The Dataset to Train and Test Sets**

In [None]:
x_train, x_test, y_train, y_test = train_test_split(X2, y, test_size=0.3, random_state=42)

In [None]:
accuracy = []

for i in range(1, 40):
    KNN = KNeighborsClassifier(n_neighbors = i).fit(x_train, y_train)
    prediction = KNN.predict(x_test)
    accuracy.append(metrics.accuracy_score(y_test, prediction))

In [None]:
plt.figure(figsize=(10, 6))
plt.plot(range(1, 40), accuracy, color = 'blue', linestyle='dashed', 
         marker='o', markerfacecolor='red', markersize=10)
plt.title('Accuracy vs. K Value')
plt.xlabel('K')
plt.ylabel('Accuracy')
print("Maximum Accuracy:", max(accuracy), "at K =", accuracy.index(max(accuracy))+1)