In [152]:
#https://github.com/gregversteeg/corex_topic

In [153]:
import nltk
import pandas as pd

In [154]:
from corextopic import corextopic as ct

# CorEx on Positive Articles

In [155]:
import pandas as pd

wri = pd.read_csv("wri.csv",index_col=0)

wri = wri[wri['labels']=='positive']
wri.reset_index(drop=True, inplace=True)

print(wri.head())
print(wri.info())

     labels                                               text
0  positive  Rashtriya Swayamsevak Sangh chief Mohan Bhagwa...
1  positive  By Shreehari PaliathIndian farms produced reco...
2  positive  AdvertisementPartha PaulA promoter was gunned ...
3  positive  indiaUpdated Jan    ISTUnion home minister Raj...
4  positive  Four days after a fire put the spotlight back ...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 456 entries, 0 to 455
Data columns (total 2 columns):
labels    456 non-null object
text      456 non-null object
dtypes: object(2)
memory usage: 7.2+ KB
None


In [156]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(
    max_df=.5,
    min_df=2,
    max_features=None,
    ngram_range=(1, 2),
    norm=None,
    binary=True,
    use_idf=False,
    sublinear_tf=False,
    strip_accents = 'unicode',
    stop_words = 'english'
)

vectorizer = vectorizer.fit(wri['text'])
tfidf = vectorizer.transform(wri['text'])
vocab = vectorizer.get_feature_names()
print(len(vocab))
print(vocab)


16306


In [157]:
from corextopic import corextopic as ct

In [158]:
TOPICS = 100
NBR_OF_WORDS = 7

In [159]:
anchors = []
model = ct.Corex(n_hidden=TOPICS, seed=42)
model = model.fit(
    tfidf,
    words=vocab
)

#### ['land', 'acre','hectares', 'acquisition', 'land acquisition', 'agricultural', 'acres', 'degradation','landslides','property','resettlement'],
   #['farmer', 'farming', 'agricultural', 'produce', 'crop', 'crops', 'agrarian', 'farms','farm','field','fields','soil','sugarcane','vegetables','farmers','agriculture','tractor','prices crops', 'debt','quota','food','fruits','livestock','cow','wheat','harvest','harvesting','horticulture','loan','loans','milk','paddy','rice','plant','plants','potatoes','potato'],
   #['mining', 'coal', 'miner', 'miners','sand mining', 'sand','bauxite','iron ore','limestone','manganese ore','granite'],
   #['forest','forests', 'forest department', 'reserve', 'forest officials','forestry'],
   #['animal','leopard','leopards', 'animals', 'wildlife', 'tiger', 'attacked', 'slaughter', 'lion','lions', 'threat', 'tigress', 'bear','birds','cat','cattle','crocodile','elephant','elephants','pangolin','pangolins','species'],
   #['drought', 'droughts','monsoon', 'rain','rains','rainfall','disaster'],
   #['water', 'irrigation', 'monsoon', 'rain', 'flood', 'floods', 'flooded', 'climate change','climate','dam','dams','drinking']

In [160]:
# Anchors designed to nudge the model towards measuring specific genres
anchors = [
    ['land','resettlement','degradation','plot'],
    ['farm','Farmers','crop','agriculture','crops','agrarian','farmer','farmers''cows','tractor','acre','fields','livestock','harvest','harvesting','potato','sugarcane','paddy','rice','milk'],
    ['mining', 'coal', 'miner', 'miners','sand mining', 'sand','bauxite','iron ore','limestone','manganese ore','granite'],
    ['forest','deforestation','trees'],
    ['animal','attacked','leopard','leopards','tiger','tigress','crocodile'],
    ['drought','rain','climate change'],
    ['water','dams','irrigation','flood','drinking']   
   
]
anchors = [
    [a for a in topic if a in vocab]
    for topic in anchors
]

model = ct.Corex(n_hidden=TOPICS, seed=42)
model = model.fit(
    tfidf,
    words=vocab,
    anchors=anchors, # Pass the anchors in here
    anchor_strength=100 # Tell the model how much it should rely on the anchors
)

In [161]:
for i, topic_ngrams in enumerate(model.get_topics(n_words=NBR_OF_WORDS)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    print("Topic #{}: {}".format(i+1, ", ".join(topic_ngrams)))

Topic #1: land, plot, degradation, resettlement, land acquisition, acquisition, act
Topic #2: crops, farm, agriculture, crop, farmer, paddy, sugarcane
Topic #3: mining, coal, sand, miners, sand mining, iron ore, bauxite
Topic #4: forest, trees, deforestation, forest department, wildlife, forests, conservation
Topic #5: animal, attacked, tiger, leopard, tigress, leopards, crocodile
Topic #6: drought, climate change, rain, climate, change, rainfall, rains
Topic #7: water, irrigation, drinking, dams, flood, river, drinking water
Topic #8: double, clashes, incomes, image, came, fought, plains
Topic #9: suicide, bku, tractors, naresh, enter, naresh tikait, enter delhi
Topic #10: rise, future, asia, increase, largest, world, opportunity
Topic #11: kisan, demands, waiver, loan waiver, loan, march, thousands farmers
Topic #12: apex, apex court, shiv, originates, judge, dispute tribunal, flags
Topic #13: law order, constitution, law, monday, farm sector, violent protests, indira
Topic #14: high

In [162]:
topic_df = pd.DataFrame(
    model.transform(tfidf), 
    columns=["topic_{}".format(i+1) for i in range(TOPICS)]
).astype(float)
topic_df.index = wri.index
wri = pd.concat([wri, topic_df], axis=1)

In [176]:
for i in range(TOPICS):
    column='topic_{}'.format(i+1)
    print(wri[column].value_counts(normalize=True))

0.0    0.686404
1.0    0.313596
Name: topic_1, dtype: float64
0.0    0.699561
1.0    0.300439
Name: topic_2, dtype: float64
0.0    0.892544
1.0    0.107456
Name: topic_3, dtype: float64
0.0    0.828947
1.0    0.171053
Name: topic_4, dtype: float64
0.0    0.826754
1.0    0.173246
Name: topic_5, dtype: float64
0.0    0.85307
1.0    0.14693
Name: topic_6, dtype: float64
0.0    0.638158
1.0    0.361842
Name: topic_7, dtype: float64
0.0    0.842105
1.0    0.157895
Name: topic_8, dtype: float64
0.0    0.85307
1.0    0.14693
Name: topic_9, dtype: float64
0.0    0.785088
1.0    0.214912
Name: topic_10, dtype: float64
0.0    0.835526
1.0    0.164474
Name: topic_11, dtype: float64
0.0    0.844298
1.0    0.155702
Name: topic_12, dtype: float64
0.0    0.855263
1.0    0.144737
Name: topic_13, dtype: float64
0.0    0.85307
1.0    0.14693
Name: topic_14, dtype: float64
0.0    0.765351
1.0    0.234649
Name: topic_15, dtype: float64
0.0    0.842105
1.0    0.157895
Name: topic_16, dtype: float64
0.0    

In [177]:
#Topic Flags
wri['topic']=wri['topic_1']+wri['topic_2']+wri['topic_3']+wri['topic_4']+wri['topic_5']+wri['topic_6']+wri['topic_7']
wri['topic'].value_counts(normalize=True)

1.0    0.355263
2.0    0.258772
0.0    0.197368
3.0    0.107456
4.0    0.037281
5.0    0.032895
6.0    0.010965
Name: topic, dtype: float64

In [181]:
MisTagged = wri[wri['topic']==0]
MisTagged.to_csv("MissTagged.csv")
print(MisTagged)
print(MisTagged.info())


       labels                                               text  topic_1  \
3    positive  indiaUpdated Jan    ISTUnion home minister Raj...      0.0   
6    positive  New Delhi Mobs of Hindus and Muslims pelted st...      0.0   
7    positive  indiaUpdated Jan    ISTLast week AIADMK parlia...      0.0   
12   positive  By Saad SayeedISLAMABAD Reuters  Four Pakistan...      0.0   
14   positive  Ittefaq ReportA Bangladeshi youth has been all...      0.0   
25   positive  Chinese troops intrude into Arunachal with roa...      0.0   
29   positive  One of the two JaisheMohammed JeM militants ki...      0.0   
35   positive  By Vilas TokaleMumbai Jan  PTI An ambitious fa...      0.0   
38   positive  This story is from January  GUWAHATI Naga insu...      0.0   
39   positive   get death for  Maharashtra honour killings Na...      0.0   
44   positive  India stood firm on the principle of multilate...      0.0   
45   positive  Eds Adding DCPs quoteVadodara Jan  PTI At leas...      0.0   

In [182]:
for i in range(TOPICS):
    column='topic_{}'.format(i+1)
    print(MisTagged[column].value_counts(normalize=True))

0.0    1.0
Name: topic_1, dtype: float64
0.0    1.0
Name: topic_2, dtype: float64
0.0    1.0
Name: topic_3, dtype: float64
0.0    1.0
Name: topic_4, dtype: float64
0.0    1.0
Name: topic_5, dtype: float64
0.0    1.0
Name: topic_6, dtype: float64
0.0    1.0
Name: topic_7, dtype: float64
0.0    0.822222
1.0    0.177778
Name: topic_8, dtype: float64
0.0    0.922222
1.0    0.077778
Name: topic_9, dtype: float64
0.0    0.833333
1.0    0.166667
Name: topic_10, dtype: float64
0.0    0.866667
1.0    0.133333
Name: topic_11, dtype: float64
0.0    0.844444
1.0    0.155556
Name: topic_12, dtype: float64
0.0    0.866667
1.0    0.133333
Name: topic_13, dtype: float64
0.0    0.944444
1.0    0.055556
Name: topic_14, dtype: float64
0.0    0.777778
1.0    0.222222
Name: topic_15, dtype: float64
0.0    0.777778
1.0    0.222222
Name: topic_16, dtype: float64
0.0    0.877778
1.0    0.122222
Name: topic_17, dtype: float64
0.0    0.744444
1.0    0.255556
Name: topic_18, dtype: float64
0.0    0.877778
1.0   