In [1]:
import os
import pandas as pd
import plotly
import plotly_express as px
import plotly.graph_objects as go

In [2]:
def extractDateFeatures(df,datetime_col):
    df['Date']=df[datetime_col].dt.date
    df['Year']=df[datetime_col].apply(lambda x:x.year)
    df['Month']=df[datetime_col].apply(lambda x:x.month)
    df['DayOfWeek']=df[datetime_col].dt.dayofweek
    df['Hour']=df[datetime_col].apply(lambda x:x.hour)
    df['Minute']=df[datetime_col].apply(lambda x:x.minute)
    return df

def flattenList(nested_list):
    flat_list = [item for sublist in nested_list for item in sublist]
    return flat_list

In [3]:
data = pd.read_csv('..\data\interim\prepared_data.csv')

  data = pd.read_csv('..\data\interim\prepared_data.csv')


In [4]:
print("Number of Papers Related to AI and ML is ",data.shape[0])
print('Number of columns in data: ', data.shape[1])

Number of Papers Related to AI and ML is  132794
Number of columns in data:  7


In [5]:
data.head()

Unnamed: 0,id,authors,title,abstract,category,prepared_text,version
0,704.1267,"Laurence Likforman-Sulem, Abderrazak Zahour, B...",Text Line Segmentation of Historical Documents...,There is a huge amount of historical documents...,Computer Vision and Pattern Recognition,Text Line Segmentation of Historical Documents...,"Tue, 10 Apr 2007 16:26:42 GMT"
1,704.1274,David H. Wolpert and Dev G. Rajnarayan,Parametric Learning and Monte Carlo Optimization,This paper uncovers and explores the close rel...,Machine Learning,Parametric Learning and Monte Carlo Optimizati...,"Tue, 10 Apr 2007 17:01:07 GMT"
2,704.1294,"Ahmed Sidky, James Arthur, Shawn Bohner",A Disciplined Approach to Adopting Agile Pract...,Many organizations aspire to adopt agile proce...,Software Engineering,A Disciplined Approach to Adopting Agile Pract...,"Tue, 10 Apr 2007 19:11:51 GMT"
3,704.1373,"Burgy Laurent (INRIA Futurs), Laurent R\'eveil...",A Language-Based Approach for Improving the Ro...,The secure and robust functioning of a network...,Programming Languages,A Language-Based Approach for Improving the Ro...,"Wed, 11 Apr 2007 08:35:32 GMT"
4,704.1394,"Tarik Hadzic, Rune Moller Jensen, Henrik Reif ...",Calculating Valid Domains for BDD-Based Intera...,In these notes we formally describe the functi...,Artificial Intelligence,Calculating Valid Domains for BDD-Based Intera...,"Wed, 11 Apr 2007 10:59:56 GMT"


In [6]:
data['DateTime'] = pd.to_datetime(data['version'])
data = extractDateFeatures(data,"DateTime")

In [7]:
papers_over_years=data.groupby(['Year']).size().reset_index().rename(columns={0:'Number Of Papers Published'})
fig_one = px.line(x="Year",y="Number Of Papers Published",data_frame=papers_over_years,title="Growth of AI ML over the Years")
fig_one.show()

In [8]:
papers_published_over_days=data.groupby(['Date']).size().reset_index().rename(columns={0:'Papers Published By Date'})
fig_two = px.line(x="Date",y="Papers Published By Date",data_frame=papers_published_over_days,title="Average Papers Published Over Each Day")
fig_two.show()

In [9]:
data['num_authors'] = data['authors'].apply(lambda x:len(x))
data['authors'] = data['authors'].apply(lambda x: [name.strip() for name in x.split(',') if name.strip()])
data.head()

Unnamed: 0,id,authors,title,abstract,category,prepared_text,version,DateTime,Date,Year,Month,DayOfWeek,Hour,Minute,num_authors
0,704.1267,"[Laurence Likforman-Sulem, Abderrazak Zahour, ...",Text Line Segmentation of Historical Documents...,There is a huge amount of historical documents...,Computer Vision and Pattern Recognition,Text Line Segmentation of Historical Documents...,"Tue, 10 Apr 2007 16:26:42 GMT",2007-04-10 16:26:42,2007-04-10,2007,4,1,16,26,58
1,704.1274,[David H. Wolpert and Dev G. Rajnarayan],Parametric Learning and Monte Carlo Optimization,This paper uncovers and explores the close rel...,Machine Learning,Parametric Learning and Monte Carlo Optimizati...,"Tue, 10 Apr 2007 17:01:07 GMT",2007-04-10 17:01:07,2007-04-10,2007,4,1,17,1,38
2,704.1294,"[Ahmed Sidky, James Arthur, Shawn Bohner]",A Disciplined Approach to Adopting Agile Pract...,Many organizations aspire to adopt agile proce...,Software Engineering,A Disciplined Approach to Adopting Agile Pract...,"Tue, 10 Apr 2007 19:11:51 GMT",2007-04-10 19:11:51,2007-04-10,2007,4,1,19,11,39
3,704.1373,"[Burgy Laurent (INRIA Futurs), Laurent R\'evei...",A Language-Based Approach for Improving the Ro...,The secure and robust functioning of a network...,Programming Languages,A Language-Based Approach for Improving the Ro...,"Wed, 11 Apr 2007 08:35:32 GMT",2007-04-11 08:35:32,2007-04-11,2007,4,2,8,35,120
4,704.1394,"[Tarik Hadzic, Rune Moller Jensen, Henrik Reif...",Calculating Valid Domains for BDD-Based Intera...,In these notes we formally describe the functi...,Artificial Intelligence,Calculating Valid Domains for BDD-Based Intera...,"Wed, 11 Apr 2007 10:59:56 GMT",2007-04-11 10:59:56,2007-04-11,2007,4,2,10,59,54


In [10]:
ai_authors=pd.DataFrame(data['authors'].tolist()).rename(columns={0:'authors'})
papers_by_authors=ai_authors.groupby(['authors']).size().reset_index().rename(columns={0:'Number of Papers Published'}).sort_values("Number of Papers Published",ascending=False).head(10)
fig_three = px.bar(x="Number of Papers Published",y="authors",data_frame=papers_by_authors.sort_values("Number of Papers Published",ascending=True),title="Top 10 Popular Authors",orientation="h")
fig_three.show()

In [16]:
fig_one.write_image(r'..\reports\figures\growth_ai.jpg')
fig_two.write_image(r'..\reports\figures\average_papers.jpg')
fig_three.write_image(r'..\reports\figures\popular_authors.jpg')