In [1]:
import os
import pandas as pd
from tqdm import tqdm
import re
import math
from collections import Counter
import traceback

In [2]:
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two sparse term-weight vectors.

    Args:
        vec1: Mapping from term to numeric weight (e.g. a ``Counter``).
        vec2: Mapping from term to numeric weight (e.g. a ``Counter``).

    Returns:
        The dot product over the shared terms divided by the product of the
        two Euclidean norms, or 0.0 when that product is zero.
    """
    shared_terms = set(vec1.keys()) & set(vec2.keys())
    dot_product = sum(vec1[term] * vec2[term] for term in shared_terms)

    norm1 = math.sqrt(sum(weight ** 2 for weight in vec1.values()))
    norm2 = math.sqrt(sum(weight ** 2 for weight in vec2.values()))
    magnitude = norm1 * norm2

    # Guard clause: an empty (zero-length) vector has no direction.
    if magnitude:
        return float(dot_product) / magnitude
    return 0.0


def text_to_vector_counter(text):
    """Count the digit-free word tokens found in ``text``.

    A token is a run of non-digit word characters that ends at a word
    boundary, so a run immediately followed by a digit is not counted.
    Matching is case-sensitive.

    Returns:
        A ``Counter`` mapping each token to its number of occurrences.
    """
    tokens = re.findall(r'[^\d\W]+\b', text)
    return Counter(tokens)


def cosine_similarity(content_a, content_b):
    """Return the bag-of-words cosine similarity of two strings.

    Each string is turned into a token-count vector with
    ``text_to_vector_counter`` and the two vectors are compared with
    ``get_cosine``.
    """
    return get_cosine(
        text_to_vector_counter(content_a),
        text_to_vector_counter(content_b),
    )

In [3]:
def text_to_vector(text):
    """Split ``text`` into lower-cased word tokens.

    Returns:
        A list of tokens (runs of word characters) in order of appearance;
        repeated tokens are kept.
    """
    return re.findall(r'[^\W]+\b', text.lower())

In [4]:
# Scratch check: remove every character that is neither a word character
# nor a space from a sample author string.
x='VepsÃ¤lÃ¤inen, Juho'
# Raw string: '\w' inside a plain literal is an invalid escape sequence.
x=re.sub(r'[^\w ]','',x)
x

'VepsÃlÃinen Juho'

In [5]:
# Scratch check: tokenise a sample author string and a sample query text.
x="VepsÃ¤lÃ¤inen, Juho, Asimuzzaman, Md, Pinku Deb Nath, Farah Hossain, Asif Hossain, and Rashedur M"
y="rashedur"

# Raw string: '\w' inside a plain literal is an invalid escape sequence.
x=re.sub(r'[^\w ]','',x)
author_list=text_to_vector(x)
text_list=text_to_vector(y)


In [6]:
# Display the tokens produced from the sample author string.
author_list

['vepsãlãinen',
 'juho',
 'asimuzzaman',
 'md',
 'pinku',
 'deb',
 'nath',
 'farah',
 'hossain',
 'asif',
 'hossain',
 'and',
 'rashedur',
 'm']

In [7]:
# True when every token of the query text is also one of the author tokens.
check = set(text_list).issubset(author_list)

check


True

In [8]:
# csv_dir: folder whose CSV files are read by the encoding loop.
# output_dir: folder that receives the same files with the encoding columns added.
# NOTE(review): absolute, machine-specific paths — adjust before running elsewhere.
csv_dir=r'C:\Users\Saranga\Desktop\Devopedia\Work\Devopedia\csv_outputs_from_HTML3'
output_dir=r'C:\Users\Saranga\Desktop\Devopedia\Work\Devopedia\csv_outputs_encodings3'

In [9]:


# For every CSV under `csv_dir`, flag each row whose Text matches the file's
# title, author or year of publication, then save the annotated frame under
# the same file name in `output_dir`.
os.makedirs(output_dir, exist_ok=True)  # DataFrame.to_csv does not create missing folders

for root, _, files in os.walk(csv_dir):
    for file in tqdm(files):

        # os.walk also descends into sub-folders, so join with the folder
        # being visited rather than with the top-level directory.
        fpath = os.path.join(root, file)

        df = pd.read_csv(fpath)

        if df.size == 0:
            continue

        # The author tokens are taken from the first row only.
        author_list = text_to_vector(df.Author[0])
        author_tokens = set(author_list)

        title_enc = []
        author_enc = []
        yop_enc = []

        for _idx, row in df.iterrows():
            title = str(row.Title)
            text = str(row.Text)

            # Title: exact substring, or a near-identical bag of words.
            title_hit = title in text or cosine_similarity(title, text) > 0.97
            title_enc.append(1 if title_hit else 0)

            text_list = text_to_vector(text)

            # all() over an empty token list is vacuously True, so a row whose
            # text contains no word characters has to be rejected explicitly.
            # A lone "and" is also rejected (presumably the separator between
            # author names — confirm against the scraped data).
            author_hit = (
                bool(text_list)
                and text_list != ['and']
                and all(item in author_tokens for item in text_list)
            )
            author_enc.append(1 if author_hit else 0)

            # Year of publication: every text token must be a year token.
            yop_tokens = set(text_to_vector(str(row.YoPublishing)))
            yop_hit = bool(text_list) and all(item in yop_tokens for item in text_list)
            yop_enc.append(1 if yop_hit else 0)

        # Fallback when no row consists solely of author tokens: flag the rows
        # whose text contains every author token instead. Skipped for
        # single-token authors.
        if 1 not in author_enc and len(author_list) > 1:
            author_enc = []
            for _idx, row in df.iterrows():
                text_tokens = set(text_to_vector(str(row.Text)))
                author_enc.append(1 if author_tokens <= text_tokens else 0)

        df['Title_encoding'] = title_enc
        df['Author_encoding'] = author_enc
        df['YoP_encoding'] = yop_enc

        # 1-based running row number within the file.
        df['SerialNo'] = list(range(1, len(title_enc) + 1))

        df = df[['YoPublishing', 'Title', 'Author', 'fname', 'Title_encoding',
                 'Author_encoding', 'YoP_encoding', 'SerialNo', 'Tag', 'Text']]

        output_path = os.path.join(output_dir, file)
        df.to_csv(output_path, index=False)
        


100%|██████████████████████████████████████████████████████████████████████████████| 6761/6761 [12:53<00:00,  8.74it/s]


In [10]:
# The encoded CSVs written to output_dir are the input of the merge step.
enc_csv_dir=output_dir

In [11]:
# Merge every encoded CSV into one frame (`df_all`) and collect, per file,
# the number of distinct texts flagged as author / title.
Author_distinct_count=[]
Title_distinct_count=[]
fnames=[]
Authors=[]
Titles=[]
error_count=0

row_count=0

all_columns=['YoPublishing','Title','Author','fname','Title_encoding','Author_encoding',
             'YoP_encoding','SerialNo','Tag','Text']

# Collect the frames and concatenate once at the end: growing a DataFrame
# inside the loop copies all accumulated rows on every iteration, and
# DataFrame.append no longer exists in pandas 2.x.
frames=[]

for root,_,files in os.walk(enc_csv_dir):
    for file in tqdm(files):

        # Join with the folder being visited; os.walk also yields sub-folders.
        fpath=os.path.join(root,file)

        df=pd.read_csv(fpath)

        try:
            frames.append(df)

            # Compute every per-file value before touching the result lists,
            # so a failure cannot leave the lists with different lengths.
            file_fname=df.fname[0]
            file_author=df.Author[0]
            file_title=df.Title[0]
            author_count=df[df.Author_encoding==1].Text.nunique()
            title_count=df[df.Title_encoding==1].Text.nunique()

        except Exception:

            traceback.print_exc()
            error_count+=1
            continue

        fnames.append(file_fname)
        Author_distinct_count.append(author_count)
        Authors.append(file_author)
        Titles.append(file_title)
        Title_distinct_count.append(title_count)

        row_count+=len(df)

# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns when no file was read.
if frames:
    df_all=pd.concat(frames,ignore_index=True)
else:
    df_all=pd.DataFrame(columns=all_columns)
        

100%|██████████████████████████████████████████████████████████████████████████████| 6761/6761 [33:23<00:00,  3.38it/s]


In [12]:
# (rows, columns) of the merged frame.
df_all.shape

(1969471, 10)

In [13]:
# Folder that receives the merged CSV and the diagnostics files.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
master_output_path=r'C:\Users\Saranga\Desktop\Devopedia\Work\articleRefs.v2525'
# Create the folder up front: to_csv does not create missing directories.
os.makedirs(master_output_path,exist_ok=True)

all_csv_file_path=os.path.join(master_output_path,'all_full_6.csv')
df_all.to_csv(all_csv_file_path,index=False)

In [14]:
def _diagnostics_frame(file_names, counts, labels, count_column, label_column):
    """Build a per-file diagnostics table sorted by ascending count.

    Args:
        file_names: Values for the 'Fname' column.
        counts: Per-file distinct counts, stored under ``count_column``.
        labels: Per-file author or title, stored under ``label_column``.
        count_column: Name of the count column; also the sort key.
        label_column: Name of the label column.

    Returns:
        A DataFrame with columns ['Fname', count_column, label_column].
    """
    frame=pd.DataFrame(list(zip(file_names,counts,labels)),
                       columns=['Fname',count_column,label_column])
    return frame.sort_values(by=[count_column], ascending=True)


# Author diagnostics: distinct texts flagged as author, per file.
Author_distinct_count_df=_diagnostics_frame(fnames,Author_distinct_count,Authors,
                                            'Distinct_authors_count','Authors')

Author_output_path=os.path.join(master_output_path,'Author_diagnostics_full_6.csv')
Author_distinct_count_df.to_csv(Author_output_path,index=False)


# Title diagnostics: distinct texts flagged as title, per file.
Title_distinct_count_df=_diagnostics_frame(fnames,Title_distinct_count,Titles,
                                           'Distinct_titles_count','Titles')

Title_output_path=os.path.join(master_output_path,'Title_diagnostics_full_6.csv')
Title_distinct_count_df.to_csv(Title_output_path,index=False)

In [15]:
# Display the author diagnostics, sorted by ascending distinct-author count.
Author_distinct_count_df

Unnamed: 0,Fname,Distinct_authors_count,Authors
3963,975cd5bbfc391a7916cbd6693137107cc9f913a717c4f0...,0,3GPP
5382,ca777f66129d6f2e31d6192c675ce1945b4f95ea5aa548...,0,KeyCDN
1392,35301d2e1ec456d49cef89b294faf029d2feda3506dcc7...,0,Accenture
2707,68deb4b9287e1584a0a85c634c1f6a33318a98de32f5ae...,0,SiFive
5385,caa07ca710574370dea4aa7f33fa6bdbd4d25aac991cbf...,0,Marketwired
...,...,...,...
3490,85aa7560cd4e7ecf119f85190a68f82f710219299c5bd9...,126,Google Cloud
6066,e5a7ce77bd98df91ab464bbc3dcf5d3d021c4141a7ff1e...,135,Google Cloud
5789,d9cf605d2c097a7d57426458affc5d89e1a86d7d79b631...,135,Google Cloud
3550,87fe36fbed0c4447240650a875536c1cd9119530ae2854...,138,Google Cloud


In [16]:
# Matches any single character that is not an ASCII letter or digit.
tag_special_char_check=re.compile('[^a-zA-Z0-9]')

In [17]:
x='Z3%al'

# True when the tag contains a character outside [a-zA-Z0-9]; such tags are
# to be excluded. Compare against None by identity, not equality.
print(tag_special_char_check.search(x) is not None)
                 

True
