NOTE: I have to read my csv file from a specific directory in my laptop, so if you're about to test this code, read the csv file either regularly or from a directory. This code does work and the I have tested it multiple times. 

In [56]:
import logging
import numpy as np
import pandas as pd
import nltk
import re
import emoji
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split

def main():
    df = problem1()
    problem3(df)
    results = problem3(df)
    print(results)

def problem1():
    df = pd.read_csv("/Users/revan/Downloads/corona_fake.csv")
    # replace empty text    
    df = df.fillna('')
    df["clean_text"] = df["text"].map(clean_text)
    return df
    
def clean_text(text_str):
    text_str = remove_emojis(text_str) #removes emojis
    text_str = re.sub(r"http\S+", "", text_str) #removes links
    text_str = re.sub(r'[^\w\s]', '', text_str) #removes punctuation
    tokens = nltk.word_tokenize(text_str)
    tagged = nltk.pos_tag(tokens)
    lmtzr = WordNetLemmatizer()
    lemmatized_sentence = []
    #wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), tagged))
    for word, tag in tagged:
        if tag is None:
            # if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:       
            # else use the tag to lemmatize the token
            #print(word)
            lemmatized_sentence.append(lmtzr.lemmatize(word, pos_tagger(tag)))
    lemmatized_sentence = " ".join(lemmatized_sentence)
    stop_words = set(stopwords.words('english'))
    word_tokens = nltk.word_tokenize(lemmatized_sentence)
    filtered_sentence = []
    for w in word_tokens:
        if len(w) <= 2 or re.match(r'\d+',  w): continue
        if w not in stop_words:
            filtered_sentence.append(w)
    return ' '.join(filtered_sentence)

def pos_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:         
        return wordnet.NOUN

    
def remove_emojis(text_str):
    return ''.join(c for c in text_str if c not in emoji.UNICODE_EMOJI)


2) N-Grams are words, or combinations of words, broken out by the number of words in that combination. It starts off with unigrams (one word), bigrams (two words), trigrams (three words), etc. The words must follow sequentially to be an n-gram. According to Seerinteractive, N-Grams are useful for turning written language into data, and breaking down larger portions of search data into more meaningful segments that help to identify the root cause behind trends. 

In [57]:
def problem2(df):    
    clean_text = df['clean_text'].tolist()
    vectorizer1 = CountVectorizer(ngram_range=(1,1), lowercase= True)
    vectorizer2 = CountVectorizer(ngram_range=(1,2), lowercase= True)
    vectorizer3 = CountVectorizer(ngram_range=(1,3), lowercase= True)
    cv1 = vectorizer1.fit_transform(clean_text)
    cv2 = vectorizer2.fit_transform(clean_text)
    cv3 = vectorizer3.fit_transform(clean_text)
    tfidf1 = TfidfVectorizer(ngram_range=(1,1), lowercase= True)
    tfidf2 = TfidfVectorizer(ngram_range=(1,2), lowercase= True)
    tfidf3 = TfidfVectorizer(ngram_range=(1,3), lowercase= True)
    tf1 = tfidf1.fit_transform(clean_text)
    tf2 = tfidf2.fit_transform(clean_text)
    tf3 = tfidf3.fit_transform(clean_text)
    return (cv1, cv2, cv3, tf1, tf2, tf3)
    

In [58]:
def problem3(df):
    y = df['label'].map({'fake': 0, 'true': 1}).tolist() 
    (cv1, cv2, cv3, tf1, tf2, tf3) = problem2(df)
    #print(logistic_regression(cv1, y))
    results = []
    I = 1
    for cv in cv1, cv2, cv3:    
        results.append([f"CV{I}" ,  logistic_regression(cv, y) ])        
        I += 1
    
    I = 1
    for tf in tf1, tf2, tf3:    
        results.append([f"TF{I}" ,  logistic_regression(tf, y) ])
        I += 1
    return pd.DataFrame(results, columns=['label', 'accuracy'])    
    
    
def logistic_regression(x,y):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, train_size=0.70, random_state = 265)
    clf = LogisticRegressionCV(cv = 5, random_state = 265, max_iter  =1000, n_jobs = -1).fit(x_train, y_train)
    return clf.score(x_test, y_test)
    
    
    

In [59]:
main()

  label  accuracy
0   CV1  0.913793
1   CV2  0.916667
2   CV3  0.913793
3   TF1  0.925287
4   TF2  0.905172
5   TF3  0.902299


4) For each function, explain in around 100 words what they mean; specifically

newton-cg -  A newton method. Newton methods use an exact Hessian matrix. It's slow for large datasets, because it computes the second derivatives. A Hessian matrix  is a square matrix of second-order partial derivatives of a scalar-valued function, or scalar field. It describes the local curvature of a function of many variables. It only support L2 regularization or no regularization

lbfgs — Stands for Limited-memory Broyden–Fletcher–Goldfarb–Shanno. It approximates the second derivative matrix updates with gradient evaluations. It stores only the last few updates, so it saves memory. It isn't super fast with large data sets. It only support L2 regularization or no regularization.

liblinear — Library for Large Linear Classification. Uses a coordinate descent algorithm. Coordinate descent is based on minimizing a multivariate function by solving univariate optimization problems in a loop. In other words, it moves toward the minimum in one direction at a time. It is the default solver for Scikit-learn versions earlier than 0.22.0. It performs pretty well with high dimensionality. It does have a number of drawbacks. It can get stuck, is unable to run in parallel, and can only solve multi-class logistic regression with one-vs.-rest.

sag — solver uses Stochastic Average Gradient descent. A variation of gradient descent and incremental aggregated gradient approaches that uses a random sample of previous gradient values. Fast for big datasets. It only support L2 regularization or no regularization

saga — The SAGA solver is a variant of SAG that also supports the non-smooth penalty L1 option (i.e. L1 Regularization). This is therefore the solver of choice for sparse multinomial logistic regression and it’s also suitable for very Large dataset. Should generally train faster than sag. 