In [27]:
import pandas as pd
import numpy as np
from textblob import TextBlob
import re
import nltk
import seaborn as sn

In [7]:
# Load the article data; the first CSV column is used as the row index.
df = pd.read_csv('NYTGeneraldata1.csv', index_col=0)
# Preview the first five rows.
df.head()

Unnamed: 0,abstract,headline,pub_date
0,about a fifth of the nation’s economy depends ...,thailand reopens to tourists from 63 countries...,2021-11-01 12:26:03+00:00
1,as some try to lure audiences back with short ...,a 6-hour opera in a pandemic? the met goes for...,2021-11-01 14:14:46+00:00
2,"while big companies wield considerable power, ...",how the pandemic has added to labor unrest,2021-11-01 15:06:46+00:00
3,the labor department has completed a draft of ...,the biden administration will publish vaccine ...,2021-11-01 16:29:36+00:00
4,"after months of production delays, this is the...","the novavax vaccine, backed by operation warp ...",2021-11-01 16:40:48+00:00


In [9]:
# Inspect the dtype pandas inferred for each column after loading the CSV.
df.dtypes

abstract    object
headline    object
pub_date    object
dtype: object

### pub_date is read back with dtype object (plain strings) because CSV files do not store column dtypes, even though we converted it during the data extraction process. Converting it to datetime.

In [12]:
# Parse the publication timestamps (strings after the CSV round trip)
# into proper datetime values, replacing the column in place.
df['pub_date'] = pd.to_datetime(df['pub_date'])
df

Unnamed: 0,abstract,headline,pub_date
0,about a fifth of the nation’s economy depends ...,thailand reopens to tourists from 63 countries...,2021-11-01 12:26:03+00:00
1,as some try to lure audiences back with short ...,a 6-hour opera in a pandemic? the met goes for...,2021-11-01 14:14:46+00:00
2,"while big companies wield considerable power, ...",how the pandemic has added to labor unrest,2021-11-01 15:06:46+00:00
3,the labor department has completed a draft of ...,the biden administration will publish vaccine ...,2021-11-01 16:29:36+00:00
4,"after months of production delays, this is the...","the novavax vaccine, backed by operation warp ...",2021-11-01 16:40:48+00:00
...,...,...,...
361,the company said that analyses and modeling of...,regeneron says its antibody treatment may not ...,2021-11-30 16:03:10+00:00
362,intense research into the new coronavirus vari...,omicron: what is known — and still unknown,2021-11-30 17:08:17+00:00
363,britain’s approach to coronavirus-related rest...,"amid variant fears, u.k. discovers limits to i...",2021-11-30 18:14:44+00:00
364,"unlike alpha, beta and delta, the name of the ...",how do you say ‘omicron’?,2021-11-30 18:51:06+00:00


In [13]:
# Re-check the column dtypes after converting pub_date.
df.dtypes

abstract                 object
headline                 object
pub_date    datetime64[ns, UTC]
dtype: object

### Adding a new DATE column that keeps only the calendar date (no time of day or UTC offset) so we can see how many relevant articles there are per day (might be easier for plotting/analysis later)

In [14]:
# Keep only the calendar-date part of each publication timestamp.
# `.dt.date` yields plain `datetime.date` objects, so the new column
# has dtype object.
df['DATE'] = pd.to_datetime(df['pub_date']).dt.date
df

Unnamed: 0,abstract,headline,pub_date,DATE
0,about a fifth of the nation’s economy depends ...,thailand reopens to tourists from 63 countries...,2021-11-01 12:26:03+00:00,2021-11-01
1,as some try to lure audiences back with short ...,a 6-hour opera in a pandemic? the met goes for...,2021-11-01 14:14:46+00:00,2021-11-01
2,"while big companies wield considerable power, ...",how the pandemic has added to labor unrest,2021-11-01 15:06:46+00:00,2021-11-01
3,the labor department has completed a draft of ...,the biden administration will publish vaccine ...,2021-11-01 16:29:36+00:00,2021-11-01
4,"after months of production delays, this is the...","the novavax vaccine, backed by operation warp ...",2021-11-01 16:40:48+00:00,2021-11-01
...,...,...,...,...
361,the company said that analyses and modeling of...,regeneron says its antibody treatment may not ...,2021-11-30 16:03:10+00:00,2021-11-30
362,intense research into the new coronavirus vari...,omicron: what is known — and still unknown,2021-11-30 17:08:17+00:00,2021-11-30
363,britain’s approach to coronavirus-related rest...,"amid variant fears, u.k. discovers limits to i...",2021-11-30 18:14:44+00:00,2021-11-30
364,"unlike alpha, beta and delta, the name of the ...",how do you say ‘omicron’?,2021-11-30 18:51:06+00:00,2021-11-30


In [15]:
# Check the dtype of the new DATE column alongside the existing columns.
df.dtypes

abstract                 object
headline                 object
pub_date    datetime64[ns, UTC]
DATE                     object
dtype: object

In [16]:
# Convert the DATE column from Python date objects to a timezone-naive
# pandas datetime column.
df['DATE'] = pd.to_datetime(df['DATE'])

In [17]:
# Re-check the column dtypes after converting DATE with pd.to_datetime.
df.dtypes

abstract                 object
headline                 object
pub_date    datetime64[ns, UTC]
DATE             datetime64[ns]
dtype: object

### Number of COVID-related articles per day

In [30]:
# Number of rows (articles) for each calendar day, indexed by DATE.
count = df.groupby('DATE').size()
count

DATE
2021-11-01     9
2021-11-02    15
2021-11-03    19
2021-11-04    20
2021-11-05    11
2021-11-06     5
2021-11-07     4
2021-11-08    11
2021-11-09    16
2021-11-10     9
2021-11-11     8
2021-11-12     8
2021-11-13    10
2021-11-14     6
2021-11-15    11
2021-11-16    16
2021-11-17    21
2021-11-18    17
2021-11-19     9
2021-11-20    11
2021-11-21     8
2021-11-22    10
2021-11-23    16
2021-11-24    15
2021-11-25     6
2021-11-26    14
2021-11-27    11
2021-11-28    13
2021-11-29    22
2021-11-30    15
dtype: int64

### Using TextBlob to find polarity and subjectivity sentiment of abstract
#### Sugya's code! Thank you!

In [18]:
def polarity(x):
    """Return the TextBlob sentiment polarity of the text `x`."""
    return TextBlob(x).sentiment.polarity


def subjectivity(x):
    """Return the TextBlob sentiment subjectivity of the text `x`."""
    return TextBlob(x).sentiment.subjectivity


# Score every abstract and store the results as two new columns.
df['Abstract Polarity'] = df['abstract'].apply(polarity)
df['Abstract Subjectivity'] = df['abstract'].apply(subjectivity)
df

Unnamed: 0,abstract,headline,pub_date,DATE,Abstract Polarity,Abstract Subjectivity
0,about a fifth of the nation’s economy depends ...,thailand reopens to tourists from 63 countries...,2021-11-01 12:26:03+00:00,2021-11-01,0.300000,0.450000
1,as some try to lure audiences back with short ...,a 6-hour opera in a pandemic? the met goes for...,2021-11-01 14:14:46+00:00,2021-11-01,0.000000,0.150000
2,"while big companies wield considerable power, ...",how the pandemic has added to labor unrest,2021-11-01 15:06:46+00:00,2021-11-01,0.087273,0.290909
3,the labor department has completed a draft of ...,the biden administration will publish vaccine ...,2021-11-01 16:29:36+00:00,2021-11-01,-0.033333,0.133333
4,"after months of production delays, this is the...","the novavax vaccine, backed by operation warp ...",2021-11-01 16:40:48+00:00,2021-11-01,0.250000,0.333333
...,...,...,...,...,...,...
361,the company said that analyses and modeling of...,regeneron says its antibody treatment may not ...,2021-11-30 16:03:10+00:00,2021-11-30,0.350000,0.550000
362,intense research into the new coronavirus vari...,omicron: what is known — and still unknown,2021-11-30 17:08:17+00:00,2021-11-30,0.146591,0.446970
363,britain’s approach to coronavirus-related rest...,"amid variant fears, u.k. discovers limits to i...",2021-11-30 18:14:44+00:00,2021-11-30,-0.008333,0.158333
364,"unlike alpha, beta and delta, the name of the ...",how do you say ‘omicron’?,2021-11-30 18:51:06+00:00,2021-11-30,0.291667,0.425000


### Categorising abstract polarity

In [19]:
def calculate_polarity(Polarity):
    """Map a polarity score to a descriptive category label.

    Parameters
    ----------
    Polarity : float
        Polarity score to categorise (this notebook passes values from
        the 'Abstract Polarity' column).

    Returns
    -------
    str
        "Extremely", "Significantly", "Fairly" or "Slightly" followed by
        "positive" / "negative" for scores beyond +/-0.75, +/-0.5,
        +/-0.3 and +/-0.1 respectively (strict comparisons), otherwise
        "Neutral".
    """
    # Positive side: test the largest threshold first so each score gets
    # the strongest label it qualifies for.
    if Polarity > 0.75:
        return "Extremely positive"
    elif Polarity > 0.5:
        return "Significantly positive"
    elif Polarity > 0.3:
        return "Fairly positive"
    elif Polarity > 0.1:
        return "Slightly positive"
    # Negative side: the most negative threshold has to be tested first;
    # otherwise `< -0.1` would match every negative score and the
    # stronger negative categories could never be returned.
    elif Polarity < -0.75:
        return "Extremely negative"
    elif Polarity < -0.5:
        return "Significantly negative"
    elif Polarity < -0.3:
        return "Fairly negative"
    elif Polarity < -0.1:
        return "Slightly negative"
    else:
        # Scores within [-0.1, 0.1] land here, as does NaN (every
        # comparison with NaN is False).
        return "Neutral"
# Label every abstract's polarity score with its category.
polarity_scores = df['Abstract Polarity']
df['Polarity Category'] = polarity_scores.map(calculate_polarity)
df

Unnamed: 0,abstract,headline,pub_date,DATE,Abstract Polarity,Abstract Subjectivity,Polarity Category
0,about a fifth of the nation’s economy depends ...,thailand reopens to tourists from 63 countries...,2021-11-01 12:26:03+00:00,2021-11-01,0.300000,0.450000,Slightly positive
1,as some try to lure audiences back with short ...,a 6-hour opera in a pandemic? the met goes for...,2021-11-01 14:14:46+00:00,2021-11-01,0.000000,0.150000,Neutral
2,"while big companies wield considerable power, ...",how the pandemic has added to labor unrest,2021-11-01 15:06:46+00:00,2021-11-01,0.087273,0.290909,Neutral
3,the labor department has completed a draft of ...,the biden administration will publish vaccine ...,2021-11-01 16:29:36+00:00,2021-11-01,-0.033333,0.133333,Neutral
4,"after months of production delays, this is the...","the novavax vaccine, backed by operation warp ...",2021-11-01 16:40:48+00:00,2021-11-01,0.250000,0.333333,Slightly positive
...,...,...,...,...,...,...,...
361,the company said that analyses and modeling of...,regeneron says its antibody treatment may not ...,2021-11-30 16:03:10+00:00,2021-11-30,0.350000,0.550000,Fairly positive
362,intense research into the new coronavirus vari...,omicron: what is known — and still unknown,2021-11-30 17:08:17+00:00,2021-11-30,0.146591,0.446970,Slightly positive
363,britain’s approach to coronavirus-related rest...,"amid variant fears, u.k. discovers limits to i...",2021-11-30 18:14:44+00:00,2021-11-30,-0.008333,0.158333,Neutral
364,"unlike alpha, beta and delta, the name of the ...",how do you say ‘omicron’?,2021-11-30 18:51:06+00:00,2021-11-30,0.291667,0.425000,Slightly positive


### Categorising abstract subjectivity

In [20]:
def calculate_subjectivity(subjectivity):
    """Map a subjectivity score to a descriptive category label.

    Parameters
    ----------
    subjectivity : float
        Subjectivity score to categorise (this notebook passes values
        from the 'Abstract Subjectivity' column).

    Returns
    -------
    str or None
        "Extremely subjective" (> 0.75), "Fairly subjective" (> 0.5),
        "Fairly objective" (> 0.3) or "Extremely objective" (<= 0.3).
        None is returned only when no comparison holds, i.e. for NaN.
    """
    if subjectivity > 0.75:
        return "Extremely subjective"
    if subjectivity > 0.5:
        return "Fairly subjective"
    if subjectivity > 0.3:
        return "Fairly objective"
    # Everything at or below 0.3 is the most objective bucket. Testing
    # `> 0.1` here would leave scores in [0, 0.1] without any label.
    if subjectivity <= 0.3:
        return "Extremely objective"
    # Reached only for NaN, which fails every comparison above.
    return None
# Label every abstract's subjectivity score with its category.
subjectivity_scores = df['Abstract Subjectivity']
df['Abstract Subjectivity Category'] = subjectivity_scores.map(calculate_subjectivity)
df.head()

Unnamed: 0,abstract,headline,pub_date,DATE,Abstract Polarity,Abstract Subjectivity,Polarity Category,Abstract Subjectivity Category
0,about a fifth of the nation’s economy depends ...,thailand reopens to tourists from 63 countries...,2021-11-01 12:26:03+00:00,2021-11-01,0.3,0.45,Slightly positive,Fairly objective
1,as some try to lure audiences back with short ...,a 6-hour opera in a pandemic? the met goes for...,2021-11-01 14:14:46+00:00,2021-11-01,0.0,0.15,Neutral,Extremely objective
2,"while big companies wield considerable power, ...",how the pandemic has added to labor unrest,2021-11-01 15:06:46+00:00,2021-11-01,0.087273,0.290909,Neutral,Extremely objective
3,the labor department has completed a draft of ...,the biden administration will publish vaccine ...,2021-11-01 16:29:36+00:00,2021-11-01,-0.033333,0.133333,Neutral,Extremely objective
4,"after months of production delays, this is the...","the novavax vaccine, backed by operation warp ...",2021-11-01 16:40:48+00:00,2021-11-01,0.25,0.333333,Slightly positive,Fairly objective


### Reordering columns for ease of interpretation

In [21]:
# Put the date columns first, then the text, then each sentiment score
# next to its category label.
df = df[[
    'pub_date',
    'DATE',
    'headline',
    'abstract',
    'Abstract Polarity',
    'Polarity Category',
    'Abstract Subjectivity',
    'Abstract Subjectivity Category',
]]
df.head()

Unnamed: 0,pub_date,DATE,headline,abstract,Abstract Polarity,Polarity Category,Abstract Subjectivity,Abstract Subjectivity Category
0,2021-11-01 12:26:03+00:00,2021-11-01,thailand reopens to tourists from 63 countries...,about a fifth of the nation’s economy depends ...,0.3,Slightly positive,0.45,Fairly objective
1,2021-11-01 14:14:46+00:00,2021-11-01,a 6-hour opera in a pandemic? the met goes for...,as some try to lure audiences back with short ...,0.0,Neutral,0.15,Extremely objective
2,2021-11-01 15:06:46+00:00,2021-11-01,how the pandemic has added to labor unrest,"while big companies wield considerable power, ...",0.087273,Neutral,0.290909,Extremely objective
3,2021-11-01 16:29:36+00:00,2021-11-01,the biden administration will publish vaccine ...,the labor department has completed a draft of ...,-0.033333,Neutral,0.133333,Extremely objective
4,2021-11-01 16:40:48+00:00,2021-11-01,"the novavax vaccine, backed by operation warp ...","after months of production delays, this is the...",0.25,Slightly positive,0.333333,Fairly objective


### Mean abstract polarity and subjectivity for each day of the month

In [33]:
# Average the two numeric sentiment scores per day. The columns are
# selected explicitly because pandas >= 2.0 no longer silently drops
# non-numeric columns in groupby().mean() and raises a TypeError on the
# text columns (headline, abstract, category labels) instead.
df.groupby('DATE')[['Abstract Polarity', 'Abstract Subjectivity']].mean()

Unnamed: 0_level_0,Abstract Polarity,Abstract Subjectivity
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-11-01,0.095561,0.290903
2021-11-02,0.050026,0.312923
2021-11-03,0.074514,0.356148
2021-11-04,0.029312,0.346657
2021-11-05,0.015987,0.290618
2021-11-06,0.158333,0.304167
2021-11-07,0.24803,0.405606
2021-11-08,-0.01124,0.24167
2021-11-09,0.083969,0.276681
2021-11-10,-0.003889,0.331235
