**Precision** quantifies the number of *positive class predictions* that actually belong to the positive class.

**Recall** quantifies the number of positive class predictions made out of all positive examples in the dataset.

**F-Measure** provides a single score that balances both the concerns of precision and recall in one number.


For imbalanced classification problems, the majority class is typically referred to as the negative outcome. and 

The minority class is typically referred to as the positive outcome.


**Our case**:

    - Fake news artciles is a positive class (because we have less fake news samples) -- (class 1)
    - Real news articles in a negative class (because we have more real samples) -- (class 0)
    
    
**Precision**

    1. Precision is a metric that quantifies the number of correct positive predictions made.

    2. Precision, therefore, calculates the accuracy for the minority class.

    3. It is calculated as the ratio of correctly predicted positive examples divided by the total number of positive 
    examples that were predicted.
    

In [1]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score  
from sklearn.metrics import accuracy_score

In [746]:
orig=pd.read_excel('Orignal_labels.xlsx')
orig.head()

Unnamed: 0,File No,Real/Fake
0,1,F
1,2,F
2,3,F
3,4,F
4,5,F


In [747]:
# Added a new column to make labels in Numeric form, like F --> 1, R --> 0

orig.insert(2, "label", orig['Real/Fake'], True) 

In [748]:
orig.head()

Unnamed: 0,File No,Real/Fake,label
0,1,F,F
1,2,F,F
2,3,F,F
3,4,F,F
4,5,F,F


In [749]:
#Replace Fake News article label with 1 (F -- > 1)
#Replace Real News article label with 0 (R -- > 0)


orig["label"].replace({"F": 1, "R": 0}, inplace=True)

In [750]:
orig.head(10)

Unnamed: 0,File No,Real/Fake,label
0,1,F,1
1,2,F,1
2,3,F,1
3,4,F,1
4,5,F,1
5,6,R,0
6,7,R,0
7,8,R,0
8,9,R,0
9,10,F,1


In [751]:
act_pos=orig.loc[orig['label'] == 1]

In [752]:
act_pos

Unnamed: 0,File No,Real/Fake,label
0,1,F,1
1,2,F,1
2,3,F,1
3,4,F,1
4,5,F,1
...,...,...,...
367,368,F,1
368,369,F,1
387,388,F,1
388,389,F,1


In [753]:
act_neg=orig.loc[orig['label'] == 0]
act_neg

Unnamed: 0,File No,Real/Fake,label
5,6,R,0
6,7,R,0
7,8,R,0
8,9,R,0
13,14,R,0
...,...,...,...
395,396,R,0
396,397,R,0
397,398,R,0
398,399,R,0


In [754]:
frames = [act_pos, act_neg]

y_true = pd.concat(frames)
#y_true = y_truee.sort_values(by ='File No' )
y_true

Unnamed: 0,File No,Real/Fake,label
0,1,F,1
1,2,F,1
2,3,F,1
3,4,F,1
4,5,F,1
...,...,...,...
395,396,R,0
396,397,R,0
397,398,R,0
398,399,R,0


In [755]:
pred=pd.read_csv('MUCS_Submission_1.csv')
pred.head()

Unnamed: 0,File No,Real/Fake
0,318,F
1,355,R
2,256,R
3,30,R
4,150,F


In [756]:
# Sorted predicted file
sorted_pred = pred.sort_values(by ='File No' )
sorted_pred.head()

Unnamed: 0,File No,Real/Fake
7,1,F
248,2,R
265,3,F
234,4,F
395,5,R


In [757]:
#sorted_pred.loc[sorted_pred['File_No'] == 30]

In [758]:
# Added a new column to make labels in Numeric form, like F --> 1, R --> 0

sorted_pred.insert(2, "label", sorted_pred['Real/Fake'], True) 

In [759]:
#Replace Fake News article label with 1 (F -- > 1)
#Replace Real News article label with 0 (R -- > 0)

sorted_pred["label"].replace({"F": 1, "R": 0}, inplace=True)

In [760]:
sorted_pred

Unnamed: 0,File No,Real/Fake,label
7,1,F,1
248,2,R,0
265,3,F,1
234,4,F,1
395,5,R,0
...,...,...,...
96,396,R,0
339,397,R,0
134,398,R,0
296,399,R,0


In [761]:
pred_pos = sorted_pred.loc[sorted_pred['label'] == 1]
pred_pos

Unnamed: 0,File No,Real/Fake,label
7,1,F,1
265,3,F,1
234,4,F,1
159,10,F,1
254,12,F,1
...,...,...,...
198,380,F,1
235,382,F,1
169,388,F,1
266,389,F,1


In [762]:
pred_neg = sorted_pred.loc[sorted_pred['label'] == 0]
pred_neg

Unnamed: 0,File No,Real/Fake,label
248,2,R,0
395,5,R,0
313,6,R,0
30,7,R,0
290,8,R,0
...,...,...,...
96,396,R,0
339,397,R,0
134,398,R,0
296,399,R,0


In [763]:
#pred_pos[~pred_pos.isin(act_pos)].dropna()

In [764]:
#Find Common Rows
correct_fake = act_pos.merge(pred_pos, how = 'inner' ,indicator=False)
correct_fake

Unnamed: 0,File No,Real/Fake,label
0,1,F,1
1,3,F,1
2,4,F,1
3,10,F,1
4,12,F,1
...,...,...,...
89,367,F,1
90,368,F,1
91,369,F,1
92,388,F,1


In [765]:
#Find Rows in act_pos Which Are Not Available in pred_pos
misclassified_fakee = act_pos.merge(pred_pos, how = 'outer' ,indicator=True).loc[lambda x : x['_merge']=='left_only']
misclassified_fake = misclassified_fakee.drop("_merge", axis=1)
misclassified_fake

Unnamed: 0,File No,Real/Fake,label
1,2,F,1
4,5,F,1
6,11,F,1
9,19,F,1
10,20,F,1
13,23,F,1
18,32,F,1
20,38,F,1
28,61,F,1
33,66,F,1


In [766]:
len(misclassified_fake)

56

In [767]:
#pred_neg[~pred_neg.isin(act_neg)].dropna()

In [768]:
##Find Common Rows
correct_real = act_neg.merge(pred_neg, how = 'inner' ,indicator=False)
correct_real

Unnamed: 0,File No,Real/Fake,label
0,6,R,0
1,7,R,0
2,8,R,0
3,9,R,0
4,14,R,0
...,...,...,...
219,396,R,0
220,397,R,0
221,398,R,0
222,399,R,0


In [769]:
#Find Rows in act_pos Which Are Not Available in pred_pos
misclassified_real = act_neg.merge(pred_neg, how = 'outer' ,indicator=True).loc[lambda x : x['_merge']=='left_only']
misclassified_real= misclassified_real.drop("_merge", axis=1)
misclassified_real

Unnamed: 0,File No,Real/Fake,label
6,16,R,0
13,34,R,0
17,41,R,0
33,68,R,0
45,91,R,0
63,124,R,0
65,134,R,0
66,135,R,0
70,146,R,0
74,158,R,0


In [770]:
len(misclassified_real)

26

In [771]:
# define predictions for Precision

#150 (total predicions) = 94 + 26

frames_1 = [correct_fake, misclassified_real ]
pred_pos = pd.concat(frames_1)

#250 (total predicions) = 56 + 224 
frames_2 = [misclassified_fake, correct_real]
pred_neg = pd.concat(frames_2)

In [772]:
pred_pos

Unnamed: 0,File No,Real/Fake,label
0,1,F,1
1,3,F,1
2,4,F,1
3,10,F,1
4,12,F,1
...,...,...,...
225,373,R,0
228,376,R,0
232,380,R,0
234,382,R,0


In [773]:
pred_neg

Unnamed: 0,File No,Real/Fake,label
1,2,F,1
4,5,F,1
6,11,F,1
9,19,F,1
10,20,F,1
...,...,...,...
219,396,R,0
220,397,R,0
221,398,R,0
222,399,R,0


In [774]:
frames_3 = [pred_pos, pred_neg]

y_pred = pd.concat(frames_3)
#y_pred= y_predd.sort_values(by ='File No' )
y_pred

Unnamed: 0,File No,Real/Fake,label
0,1,F,1
1,3,F,1
2,4,F,1
3,10,F,1
4,12,F,1
...,...,...,...
219,396,R,0
220,397,R,0
221,398,R,0
222,399,R,0


In [775]:
precision = precision_score(y_true['label'], y_pred['label'], average='binary')
print('Precision: %.3f' % precision)

Precision: 0.827


In [776]:
# calculate recall
recall = recall_score(y_true['label'], y_pred['label'], average='binary')
print('Recall score is : %.3f' % recall)

Recall score is : 0.827


In [777]:
print('Accuracy of the model is: ', accuracy_score(orig['label'], sorted_pred['label']))

Accuracy of the model is:  0.795


In [782]:
# define predictions for Recall

#150 (total predicions) = 94 + 56

frames_4 = [correct_fake, misclassified_fake]
pred_pos_recall = pd.concat(frames_4)

pred_pos_recal=pred_pos_recall.sort_values(by ='File No' )
pred_pos_recal

Unnamed: 0,File No,Real/Fake,label
0,1,F,1
1,2,F,1
1,3,F,1
2,4,F,1
4,5,F,1
...,...,...,...
90,368,F,1
91,369,F,1
92,388,F,1
93,389,F,1


In [783]:
# calculate recall
recall = recall_score(act_pos['label'], pred_pos_recal['label'], average='binary')
print('Recall: %.3f' % recall)

Recall: 1.000


### MUCS submission evaluation

In [10]:
gold_df = pd.read_excel('./data/Orignal_labels.xlsx')
gold_df.shape, gold_df.columns

((400, 2), Index(['File No', 'Real/Fake'], dtype='object'))

In [11]:
mucs_df = pd.read_csv('./data/submissions/MUCS_Submission_1.csv')

In [12]:
mucs_df.shape, mucs_df.columns

((400, 2), Index(['File No', 'Real/Fake'], dtype='object'))

### Turning into 1/0

In [13]:
gold_df['label'] = gold_df['Real/Fake'].apply(lambda x: 1 if x == 'F' else 0)

In [18]:
gold_df.to_csv('./data/gold_labels.csv', index=False, header=True)

In [21]:
mucs_df['label'] = mucs_df['Real/Fake'].apply(lambda x: 1 if x == 'F' else 0)

In [24]:
mucs_df_sorted = mucs_df.sort_values(by='File No')

In [29]:
mucs_df_sorted[mucs_df_sorted['File No'] == 150]

Unnamed: 0,File No,Real/Fake,label
4,150,F,1


### Evaluating P, R, F1

In [32]:
precision_score(gold_df['label'], mucs_df_sorted['label'])

0.7833333333333333

In [37]:
recall_score(gold_df['label'], mucs_df_sorted['label'], average='binary')

0.6266666666666667

In [36]:
accuracy_score(gold_df['label'], mucs_df_sorted['label'])

0.795