# Classifier

## Load Library

In [50]:
import nltk
import preprocess
import re

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

## Preprocessing

In [29]:
# Data Load
fake_data = pd.read_csv('dataset/Fake.csv')
true_data = pd.read_csv('dataset/True.csv')

In [30]:
# Labeling
fake_data['is_fake'] = 1
true_data['is_fake'] = 0

In [31]:
df = pd.concat([fake_data, true_data])

In [36]:
print(len(df))
df.head()

44898


Unnamed: 0,title,text,subject,date,is_fake
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


In [33]:
def data_preprocessing(text):
    '''
    data cleaning
    '''
    # punctuation 제거
    text = re.sub("[^-9A-Za-z ]", "" , text)
    text=re.sub("<.*?>", ' ', text) 

    # 소문자 변환
    text = text.lower()
    
    #Tokenization
    tokens = nltk.tokenize.word_tokenize(text)
    
    # stopword 제거
    stopwords = nltk.corpus.stopwords.words('english')
    ret = []
    for w in tokens:
        if w not in stopwords:
            ret.append(w)
    tokens = ret

    # Lemmatization & Stemming
    ps = nltk.PorterStemmer()
    text = [ps.stem(word) for word in tokens]
    
    return tokens

In [37]:
df['title_token'] = df['title'].apply(data_preprocessing)
df['text_token'] = df['text'].apply(data_preprocessing)
df.head()

Unnamed: 0,title,text,subject,date,is_fake,title_token,text_token
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1,"[donald, trump, sends, embarrassing, new, year...","[donald, trump, wish, americans, happy, new, y..."
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1,"[drunk, bragging, trump, staffer, started, rus...","[house, intelligence, committee, chairman, dev..."
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1,"[sheriff, david, clarke, becomes, internet, jo...","[friday, revealed, former, milwaukee, sheriff,..."
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1,"[trump, obsessed, even, obamas, name, coded, w...","[christmas, day, donald, trump, announced, wou..."
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1,"[pope, francis, called, donald, trump, christm...","[pope, francis, used, annual, christmas, day, ..."


In [49]:
def jaccard_similarity(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

In [57]:
 df['jacc_sim'] = df[['title_token','text_token']].apply(lambda x1, x2: jaccard_similarity(x1,x2))

TypeError: <lambda>() missing 1 required positional argument: 'x2'

## Feature Settings
1. 내용과 제목간 자카드 유사도
