In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import model_selection, svm
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from scipy.stats import sem
from numpy import mean
from numpy import std

In [2]:
all_df = pd.read_csv('roman_urdu_all_data.csv')
all_df = all_df.astype(str)
print(all_df.shape)
y = all_df['Label']
y = y.astype(float)
y = y.astype(int)

(9526, 8)


In [3]:
bow_converter = CountVectorizer()
bow_x = bow_converter.fit_transform(all_df['text'])
words = bow_converter.get_feature_names_out()
print(len(words))
print(bow_x.shape)

28606
(9526, 28606)


In [4]:
bow_converter = CountVectorizer()
bow_x = bow_converter.fit_transform(all_df['text'])
words = bow_converter.get_feature_names_out()
print(len(words))
print(bow_x.shape)

28606
(9526, 28606)


In [5]:
tfidf_transform = TfidfTransformer(norm=None)
tfidf_x = tfidf_transform.fit_transform(bow_x)
print(tfidf_x.shape)

(9526, 28606)


In [6]:
bigram_converter = CountVectorizer(ngram_range=[1,2])
bigram_x = bigram_converter.fit_transform(all_df['text'])
print(bigram_x.shape)

(9526, 134987)


In [7]:
def evaluate_model_NB(X, y, repeats, eval_metric):
    cv = RepeatedKFold(n_splits=5, n_repeats=repeats, random_state=1)
    model = MultinomialNB()
    scores = cross_val_score(model, X, y, scoring= eval_metric, cv=cv, n_jobs=2)
    print(f'{eval_metric} for NB mean={mean(scores)} se={sem(scores)}')
    return scores
def evaluate_model_logistic_regression(X, y, repeats, eval_metric):
    cv = RepeatedKFold(n_splits=5, n_repeats=repeats, random_state=1)
    model = LogisticRegression(max_iter = 1000)
    scores = cross_val_score(model, X, y, scoring= eval_metric, cv=cv, n_jobs=2)
    print(f'{eval_metric} for Logistic Regression mean={mean(scores)} se={sem(scores)}')
    return scores
def evaluate_model_svm(X, y, repeats, eval_metric):
    cv = RepeatedKFold(n_splits=5, n_repeats=repeats, random_state=1)
    model = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    scores = cross_val_score(model, X, y, scoring= eval_metric, cv=cv, n_jobs=2)
    print(f'{eval_metric} for SVM mean={mean(scores)} se={sem(scores)}')
    return scores

In [8]:
def get_scores(data):
    logistic_regression = evaluate_model_logistic_regression(data, y, 8,'f1')
    NB = evaluate_model_NB(data, y, 8 , 'f1')
    svm = evaluate_model_svm(data, y, 8 , 'f1')
    return{
        'logistic regression' : logistic_regression,
        'NB' : NB,
        'svm' : svm
    }

In [9]:
get_scores(bigram_x)

f1 for Logistic Regression mean=0.7340907524289333 se=0.0023252072659527626
f1 for NB mean=0.5874747194129605 se=0.002987822875312841
f1 for SVM mean=0.7181816103386417 se=0.002413657231323022


{'logistic regression': array([0.72299169, 0.71759891, 0.75466284, 0.7416332 , 0.74033149,
        0.728     , 0.72131148, 0.73264402, 0.72682324, 0.72899729,
        0.72827586, 0.7369863 , 0.73224044, 0.75      , 0.73184358,
        0.73624824, 0.73841962, 0.73954116, 0.72150072, 0.74054759,
        0.73446328, 0.71564626, 0.77470356, 0.71888112, 0.72777778,
        0.75      , 0.73626374, 0.73726542, 0.73698264, 0.73185185,
        0.7563249 , 0.7700831 , 0.6920904 , 0.72167832, 0.71657754,
        0.73254282, 0.74366197, 0.72984441, 0.73082287, 0.73557047]),
 'NB': array([0.57581967, 0.60081466, 0.58638743, 0.5782652 , 0.59899497,
        0.60441767, 0.53893443, 0.59201774, 0.58666667, 0.61459404,
        0.58666667, 0.60931174, 0.56306761, 0.60385005, 0.58738366,
        0.6       , 0.5891947 , 0.57311089, 0.55150884, 0.60433071,
        0.57082896, 0.60587639, 0.59587021, 0.5932914 , 0.5786802 ,
        0.59340659, 0.599182  , 0.60913706, 0.562     , 0.56377278,
        0.6147704

In [10]:
get_scores(bow_x)

f1 for Logistic Regression mean=0.7047838770870992 se=0.002727899820213779
f1 for NB mean=0.5536845856073291 se=0.0030419303459276307
f1 for SVM mean=0.6924206735784356 se=0.002997989400467908


{'logistic regression': array([0.71041369, 0.685633  , 0.71150972, 0.72447552, 0.70791367,
        0.71369295, 0.68923077, 0.72782875, 0.68983269, 0.69153515,
        0.69727403, 0.69714286, 0.70014347, 0.70438472, 0.71597633,
        0.69577875, 0.72389127, 0.73626374, 0.67173252, 0.72334683,
        0.71281296, 0.69985775, 0.72676056, 0.67660209, 0.70487106,
        0.73136428, 0.69411765, 0.69101124, 0.70521862, 0.69135802,
        0.73566434, 0.72459499, 0.66469719, 0.68529412, 0.69832402,
        0.68984701, 0.71659325, 0.70254111, 0.70911722, 0.71270718]),
 'NB': array([0.53865979, 0.55729167, 0.56878307, 0.55737705, 0.5707196 ,
        0.55031847, 0.51052632, 0.56      , 0.57344301, 0.59037711,
        0.57777778, 0.55729167, 0.53768844, 0.54780362, 0.55526316,
        0.55851064, 0.53896962, 0.55745721, 0.51587302, 0.57428215,
        0.55080214, 0.57252888, 0.54956085, 0.54188482, 0.53055917,
        0.56091371, 0.56185567, 0.55483871, 0.53571429, 0.53150685,
        0.5842985

In [11]:
get_scores(tfidf_x)

f1 for Logistic Regression mean=0.6838699521965543 se=0.0034661219436301154
f1 for NB mean=0.4895480239414892 se=0.002283689991901065
f1 for SVM mean=0.6241281758923295 se=0.0035647320585654673


{'logistic regression': array([0.71428571, 0.67597765, 0.69653179, 0.68917018, 0.6741573 ,
        0.7065073 , 0.64371257, 0.71240876, 0.66223404, 0.64957265,
        0.69637883, 0.67683773, 0.68775791, 0.6751773 , 0.67048711,
        0.67625899, 0.69121813, 0.71718539, 0.64739884, 0.688     ,
        0.70144928, 0.68619247, 0.72554348, 0.65616046, 0.69198312,
        0.72086721, 0.6512301 , 0.68673051, 0.68150209, 0.67368421,
        0.72451791, 0.68489209, 0.64957265, 0.64927536, 0.6875    ,
        0.69115646, 0.69142857, 0.67067669, 0.69252874, 0.6866485 ]),
 'NB': array([0.49701493, 0.50408922, 0.48105182, 0.48802395, 0.48845867,
        0.48675734, 0.47344461, 0.47535771, 0.49819495, 0.51345119,
        0.47725578, 0.49963262, 0.48134044, 0.49454545, 0.49402985,
        0.48976497, 0.4887218 , 0.50668648, 0.44954128, 0.49889949,
        0.45914397, 0.50518519, 0.50700074, 0.49319213, 0.49107143,
        0.50702143, 0.48970252, 0.49315068, 0.48937729, 0.47194466,
        0.4948755

In [12]:
from sklearn.metrics import accuracy_score
baseline_pos = [1 for i in range(len(y))]
print(f'f1 score if all predicted are positive {accuracy_score(y, baseline_pos)}')

f1 score if all predicted are positive 0.18780180558471551
