# Practical 4
## Implement a suitable stemming algorithm based on the chosen data set

In [1]:
# The star import stays first so the explicit imports below take precedence
# over any same-named objects it may export.
from nlp_lib  import *
import json
import warnings

import numpy as np
import pandas as pd
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
warnings.filterwarnings("ignore")

In [2]:
# Probe the first line to confirm the file is line-delimited JSON before
# loading all of it. The probe uses UTF-8 explicitly so it does not depend on
# the platform's default text encoding.
with open("dataset.json", 'r', encoding="utf-8") as f:
    first_line = f.readline()
try:
    json.loads(first_line)  # Only the format probe is guarded
except ValueError as err:
    # Fail loudly here: continuing would leave `ds` undefined (or stale from a
    # previous run) and the cell would break or mislead a few lines later.
    raise ValueError("The file is not in a valid line-delimited JSON format.") from err

# Probe passed: read the whole file, one JSON object per line.
ds = pd.read_json("dataset.json", lines=True)
ds.shape

(124989, 6)

In [3]:
# Preview the first rows of the loaded frame to check the available columns.
ds.head()

Unnamed: 0,short_description,headline,date,link,authors,category
0,She left her husband. He killed their children...,There Were 2 Mass Shootings In Texas Last Week...,2018-05-26,https://www.huffingtonpost.com/entry/texas-ama...,Melissa Jeltsen,CRIME
1,Of course it has a song.,Will Smith Joins Diplo And Nicky Jam For The 2...,2018-05-26,https://www.huffingtonpost.com/entry/will-smit...,Andy McDonald,ENTERTAINMENT
2,The actor and his longtime girlfriend Anna Ebe...,Hugh Grant Marries For The First Time At Age 57,2018-05-26,https://www.huffingtonpost.com/entry/hugh-gran...,Ron Dicker,ENTERTAINMENT
3,The actor gives Dems an ass-kicking for not fi...,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,2018-05-26,https://www.huffingtonpost.com/entry/jim-carre...,Ron Dicker,ENTERTAINMENT
4,"The ""Dietland"" actress said using the bags is ...",Julianna Margulies Uses Donald Trump Poop Bags...,2018-05-26,https://www.huffingtonpost.com/entry/julianna-...,Ron Dicker,ENTERTAINMENT


### Lower Casing

In [4]:
# Work on an independent copy of the two text columns, so the column
# assignments made throughout the notebook never write into a slice of `ds`.
df = ds[['short_description', 'headline']].copy()
# Only the description is normalised; `headline` keeps its original casing.
df['short_description'] = df['short_description'].str.lower()

In [5]:
# Preview the working frame: `short_description` is lower-cased, `headline` is untouched.
df.head()

Unnamed: 0,short_description,headline
0,she left her husband. he killed their children...,There Were 2 Mass Shootings In Texas Last Week...
1,of course it has a song.,Will Smith Joins Diplo And Nicky Jam For The 2...
2,the actor and his longtime girlfriend anna ebe...,Hugh Grant Marries For The First Time At Age 57
3,the actor gives dems an ass-kicking for not fi...,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,"the ""dietland"" actress said using the bags is ...",Julianna Margulies Uses Donald Trump Poop Bags...


### Removing HTML Tags

In [6]:
# Run every description through `remove_html_tags` and overwrite the column.
# The helper is not defined in this notebook (presumably exported by the
# `nlp_lib` star import), so its exact rules are not visible here.
df['short_description']=df['short_description'].apply(remove_html_tags)
df.head()

Unnamed: 0,short_description,headline
0,she left her husband. he killed their children...,There Were 2 Mass Shootings In Texas Last Week...
1,of course it has a song.,Will Smith Joins Diplo And Nicky Jam For The 2...
2,the actor and his longtime girlfriend anna ebe...,Hugh Grant Marries For The First Time At Age 57
3,the actor gives dems an ass-kicking for not fi...,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,"the ""dietland"" actress said using the bags is ...",Julianna Margulies Uses Donald Trump Poop Bags...


### Removing URLs

In [7]:
# Run every description through `remove_url` and overwrite the column.
# The helper is not defined in this notebook (presumably exported by the
# `nlp_lib` star import), so its exact rules are not visible here.
df['short_description']=df['short_description'].apply(remove_url)
df.head()

Unnamed: 0,short_description,headline
0,she left her husband. he killed their children...,There Were 2 Mass Shootings In Texas Last Week...
1,of course it has a song.,Will Smith Joins Diplo And Nicky Jam For The 2...
2,the actor and his longtime girlfriend anna ebe...,Hugh Grant Marries For The First Time At Age 57
3,the actor gives dems an ass-kicking for not fi...,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,"the ""dietland"" actress said using the bags is ...",Julianna Margulies Uses Donald Trump Poop Bags...


### Removing Punctuation

In [8]:
# Run every description through `remove_punctuations` (helper not defined in
# this notebook; presumably from `nlp_lib`) and overwrite the column.
# In the displayed sample, hyphens are deleted rather than replaced by a
# space, so "ass-kicking" becomes the single token "asskicking".
df['short_description']=df['short_description'].apply(remove_punctuations)
df.head()

Unnamed: 0,short_description,headline
0,she left her husband he killed their children ...,There Were 2 Mass Shootings In Texas Last Week...
1,of course it has a song,Will Smith Joins Diplo And Nicky Jam For The 2...
2,the actor and his longtime girlfriend anna ebe...,Hugh Grant Marries For The First Time At Age 57
3,the actor gives dems an asskicking for not fig...,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,the dietland actress said using the bags is a ...,Julianna Margulies Uses Donald Trump Poop Bags...


### Removing Special Characters 

In [9]:
# Run every description through `removes_specials` (helper not defined in
# this notebook; presumably from `nlp_lib`) and overwrite the column.
# The five previewed rows look the same as after punctuation removal.
df['short_description']=df['short_description'].apply(removes_specials)
df.head()

Unnamed: 0,short_description,headline
0,she left her husband he killed their children ...,There Were 2 Mass Shootings In Texas Last Week...
1,of course it has a song,Will Smith Joins Diplo And Nicky Jam For The 2...
2,the actor and his longtime girlfriend anna ebe...,Hugh Grant Marries For The First Time At Age 57
3,the actor gives dems an asskicking for not fig...,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,the dietland actress said using the bags is a ...,Julianna Margulies Uses Donald Trump Poop Bags...


### Removing Non-Printable Characters

In [10]:
# Run every description through `removes_non_printables` (helper not defined
# in this notebook; presumably from `nlp_lib`) and overwrite the column.
df['short_description']=df['short_description'].apply(removes_non_printables)
df.head()

Unnamed: 0,short_description,headline
0,she left her husband he killed their children ...,There Were 2 Mass Shootings In Texas Last Week...
1,of course it has a song,Will Smith Joins Diplo And Nicky Jam For The 2...
2,the actor and his longtime girlfriend anna ebe...,Hugh Grant Marries For The First Time At Age 57
3,the actor gives dems an asskicking for not fig...,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,the dietland actress said using the bags is a ...,Julianna Margulies Uses Donald Trump Poop Bags...


### Removing Stop Words

In [11]:
# Store the stop-word-free text in a NEW column, `stopwords_removed`, so the
# cleaned `short_description` stays available for comparison.
# `remove_stopwords` is not defined in this notebook (presumably from
# `nlp_lib`); which stop-word list it uses is not visible here.
df['stopwords_removed']=df['short_description'].apply(remove_stopwords)
df.head()

Unnamed: 0,short_description,headline,stopwords_removed
0,she left her husband he killed their children ...,There Were 2 Mass Shootings In Texas Last Week...,left husband killed children another day america
1,of course it has a song,Will Smith Joins Diplo And Nicky Jam For The 2...,course song
2,the actor and his longtime girlfriend anna ebe...,Hugh Grant Marries For The First Time At Age 57,actor longtime girlfriend anna eberstein tied ...
3,the actor gives dems an asskicking for not fig...,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,actor gives dems asskicking fighting hard enou...
4,the dietland actress said using the bags is a ...,Julianna Margulies Uses Donald Trump Poop Bags...,dietland actress said using bags really cathar...


### Text Tokenization

In [12]:
# Keep only rows whose stop-word-free description still has more than one
# whitespace-separated token, then drop rows with any missing value and
# exact duplicate rows.
# A boolean mask selects the same rows as blanking them to NaN with `.where`
# and dropping them afterwards, without mutating the frame in place.
token_counts = df['stopwords_removed'].str.split().str.len()
df = df[token_counts > 1].dropna().drop_duplicates()
df.shape

(102480, 3)

In [13]:
# Turn each stop-word-free description into a token list via `tokenized_text`
# (helper not defined in this notebook; presumably from `nlp_lib`).
df['tokenized']=df['stopwords_removed'].apply(tokenized_text)
# `drop` without `inplace` only hides `headline` in this preview; `df` keeps the column.
df.drop(columns=["headline"]).head()

Unnamed: 0,short_description,stopwords_removed,tokenized
0,she left her husband he killed their children ...,left husband killed children another day america,"[left, husband, killed, children, another, day..."
1,of course it has a song,course song,"[course, song]"
2,the actor and his longtime girlfriend anna ebe...,actor longtime girlfriend anna eberstein tied ...,"[actor, longtime, girlfriend, anna, eberstein,..."
3,the actor gives dems an asskicking for not fig...,actor gives dems asskicking fighting hard enou...,"[actor, gives, dems, asskicking, fighting, har..."
4,the dietland actress said using the bags is a ...,dietland actress said using bags really cathar...,"[dietland, actress, said, using, bags, really,..."


### Implementing PorterStemmer to do stemming

In [14]:
# One shared Porter stemmer instance; `porter_stemming` reads it as a module-level global.
ps = PorterStemmer()
def porter_stemming(tokenised_text):
    """Stem every token of a token list with the shared Porter stemmer `ps`.

    Args:
        tokenised_text: A `list` of word strings. Any other type (string,
            tuple, None, NaN, ...) is treated as a missing value.

    Returns:
        A new list holding `ps.stem(token)` for each token in order, or
        `np.nan` when the input is not a list.
    """
    # Guard clause: only genuine lists are stemmed.
    if not isinstance(tokenised_text, list):
        return np.nan

    stems = []
    for token in tokenised_text:
        stems.append(ps.stem(token))
    return stems
# Stem each row's token list with `porter_stemming` and keep the result in its own column.
df['porter_stemmed']=df['tokenized'].apply(porter_stemming)
# `headline` is hidden only in this preview; `df` itself is not changed by `drop`.
df.drop(columns=["headline"]).head()

Unnamed: 0,short_description,stopwords_removed,tokenized,porter_stemmed
0,she left her husband he killed their children ...,left husband killed children another day america,"[left, husband, killed, children, another, day...","[left, husband, kill, children, anoth, day, am..."
1,of course it has a song,course song,"[course, song]","[cours, song]"
2,the actor and his longtime girlfriend anna ebe...,actor longtime girlfriend anna eberstein tied ...,"[actor, longtime, girlfriend, anna, eberstein,...","[actor, longtim, girlfriend, anna, eberstein, ..."
3,the actor gives dems an asskicking for not fig...,actor gives dems asskicking fighting hard enou...,"[actor, gives, dems, asskicking, fighting, har...","[actor, give, dem, asskick, fight, hard, enoug..."
4,the dietland actress said using the bags is a ...,dietland actress said using bags really cathar...,"[dietland, actress, said, using, bags, really,...","[dietland, actress, said, use, bag, realli, ca..."


### Applying Other Stemming Functions

In [15]:
# Apply `lancaster_stemming` (not defined in this notebook; presumably from
# `nlp_lib`) to each token list.
# In the displayed sample it cuts harder than Porter: "killed" -> "kil",
# "children" -> "childr", "actor" -> "act".
df['lancaster_stemmed']=df['tokenized'].apply(lancaster_stemming)
# The dropped columns are hidden only in this preview; `df` keeps them.
df.drop(columns=["headline","stopwords_removed"]).head()

Unnamed: 0,short_description,tokenized,porter_stemmed,lancaster_stemmed
0,she left her husband he killed their children ...,"[left, husband, killed, children, another, day...","[left, husband, kill, children, anoth, day, am...","[left, husband, kil, childr, anoth, day, americ]"
1,of course it has a song,"[course, song]","[cours, song]","[cours, song]"
2,the actor and his longtime girlfriend anna ebe...,"[actor, longtime, girlfriend, anna, eberstein,...","[actor, longtim, girlfriend, anna, eberstein, ...","[act, longtim, girlfriend, ann, eberstein, tie..."
3,the actor gives dems an asskicking for not fig...,"[actor, gives, dems, asskicking, fighting, har...","[actor, give, dem, asskick, fight, hard, enoug...","[act, giv, dem, asskick, fight, hard, enough, ..."
4,the dietland actress said using the bags is a ...,"[dietland, actress, said, using, bags, really,...","[dietland, actress, said, use, bag, realli, ca...","[dietland, actress, said, us, bag, real, catha..."


In [16]:
# Apply `snowball_stemming` (not defined in this notebook; presumably from
# `nlp_lib`) to each token list.
# The visible part of the five previewed rows matches the Porter column.
# NOTE(review): this column is named `snowball_stemming`, unlike the
# `*_stemmed` columns next to it.
df['snowball_stemming']=df['tokenized'].apply(snowball_stemming)
# The dropped columns are hidden only in this preview; `df` keeps them.
df.drop(columns=["headline","stopwords_removed","tokenized"]).head()

Unnamed: 0,short_description,porter_stemmed,lancaster_stemmed,snowball_stemming
0,she left her husband he killed their children ...,"[left, husband, kill, children, anoth, day, am...","[left, husband, kil, childr, anoth, day, americ]","[left, husband, kill, children, anoth, day, am..."
1,of course it has a song,"[cours, song]","[cours, song]","[cours, song]"
2,the actor and his longtime girlfriend anna ebe...,"[actor, longtim, girlfriend, anna, eberstein, ...","[act, longtim, girlfriend, ann, eberstein, tie...","[actor, longtim, girlfriend, anna, eberstein, ..."
3,the actor gives dems an asskicking for not fig...,"[actor, give, dem, asskick, fight, hard, enoug...","[act, giv, dem, asskick, fight, hard, enough, ...","[actor, give, dem, asskick, fight, hard, enoug..."
4,the dietland actress said using the bags is a ...,"[dietland, actress, said, use, bag, realli, ca...","[dietland, actress, said, us, bag, real, catha...","[dietland, actress, said, use, bag, realli, ca..."


In [17]:
# Apply `regexp_stemming` (not defined in this notebook; presumably from
# `nlp_lib`; the pattern it uses is not visible here) to each token list.
# In the displayed sample "killed" is left unchanged, while "actress"
# becomes "actres" and "using" becomes "us".
df['regexp_stemmed']=df['tokenized'].apply(regexp_stemming)
# The dropped columns are hidden only in this preview; `df` keeps them.
df.drop(columns=["headline","stopwords_removed","tokenized"]).head()

Unnamed: 0,short_description,porter_stemmed,lancaster_stemmed,snowball_stemming,regexp_stemmed
0,she left her husband he killed their children ...,"[left, husband, kill, children, anoth, day, am...","[left, husband, kil, childr, anoth, day, americ]","[left, husband, kill, children, anoth, day, am...","[left, husband, killed, children, another, day..."
1,of course it has a song,"[cours, song]","[cours, song]","[cours, song]","[cours, song]"
2,the actor and his longtime girlfriend anna ebe...,"[actor, longtim, girlfriend, anna, eberstein, ...","[act, longtim, girlfriend, ann, eberstein, tie...","[actor, longtim, girlfriend, anna, eberstein, ...","[actor, longtim, girlfriend, anna, eberstein, ..."
3,the actor gives dems an asskicking for not fig...,"[actor, give, dem, asskick, fight, hard, enoug...","[act, giv, dem, asskick, fight, hard, enough, ...","[actor, give, dem, asskick, fight, hard, enoug...","[actor, give, dem, asskick, fight, hard, enoug..."
4,the dietland actress said using the bags is a ...,"[dietland, actress, said, use, bag, realli, ca...","[dietland, actress, said, us, bag, real, catha...","[dietland, actress, said, use, bag, realli, ca...","[dietland, actres, said, us, bag, really, cath..."


### Applying Lemmatization using WordNetLemmatizer

In [18]:
# One shared WordNet lemmatizer instance; `wordnet_lemmatizing` reads it as a module-level global.
lemmatizer= WordNetLemmatizer()
def wordnet_lemmatizing(tokenised_text, pos="n"):
    """Lemmatize every token of a token list with the shared WordNet lemmatizer.

    Args:
        tokenised_text: A `list` of word strings. Any other type (string,
            tuple, None, NaN, ...) is treated as a missing value.
        pos: WordNet part-of-speech tag passed to `lemmatizer.lemmatize` for
            every token. The default "n" (noun) is the lemmatizer's own
            default, so one-argument calls behave exactly as before. With
            the noun tag, verb forms such as "killed" or "using" are left
            unchanged; pass "v" to reduce them.

    Returns:
        A new list holding the lemma of each token in order, or `np.nan`
        when the input is not a list.
    """
    # Guard clause: only genuine lists are lemmatized.
    if not isinstance(tokenised_text, list):
        return np.nan
    return [lemmatizer.lemmatize(word, pos=pos) for word in tokenised_text]
# Lemmatize each row's token list; unlike the stemmed columns, the displayed
# results stay dictionary words (e.g. "children" -> "child", "bags" -> "bag").
df['lemmatized']=df['tokenized'].apply(wordnet_lemmatizing)
# The dropped columns are hidden only in this preview; `df` keeps them.
df.drop(columns=["headline","stopwords_removed","tokenized"]).head()

Unnamed: 0,short_description,porter_stemmed,lancaster_stemmed,snowball_stemming,regexp_stemmed,lemmatized
0,she left her husband he killed their children ...,"[left, husband, kill, children, anoth, day, am...","[left, husband, kil, childr, anoth, day, americ]","[left, husband, kill, children, anoth, day, am...","[left, husband, killed, children, another, day...","[left, husband, killed, child, another, day, a..."
1,of course it has a song,"[cours, song]","[cours, song]","[cours, song]","[cours, song]","[course, song]"
2,the actor and his longtime girlfriend anna ebe...,"[actor, longtim, girlfriend, anna, eberstein, ...","[act, longtim, girlfriend, ann, eberstein, tie...","[actor, longtim, girlfriend, anna, eberstein, ...","[actor, longtim, girlfriend, anna, eberstein, ...","[actor, longtime, girlfriend, anna, eberstein,..."
3,the actor gives dems an asskicking for not fig...,"[actor, give, dem, asskick, fight, hard, enoug...","[act, giv, dem, asskick, fight, hard, enough, ...","[actor, give, dem, asskick, fight, hard, enoug...","[actor, give, dem, asskick, fight, hard, enoug...","[actor, give, dems, asskicking, fighting, hard..."
4,the dietland actress said using the bags is a ...,"[dietland, actress, said, use, bag, realli, ca...","[dietland, actress, said, us, bag, real, catha...","[dietland, actress, said, use, bag, realli, ca...","[dietland, actres, said, us, bag, really, cath...","[dietland, actress, said, using, bag, really, ..."


In [19]:
# Collapse each lemma list back into one string via `list_joint` (not defined
# in this notebook; presumably from `nlp_lib`). In the displayed sample the
# lemmas come back joined by single spaces.
# NOTE(review): the column name `procss` looks like a typo for "process" or
# "processed" — confirm before renaming, since later code may rely on it.
df['procss']=df['lemmatized'].apply(list_joint)

In [20]:
# Side-by-side view of the cleaned text, its tokens, and the final lemmatized string.
df[["short_description","tokenized","procss"]].head()

Unnamed: 0,short_description,tokenized,procss
0,she left her husband he killed their children ...,"[left, husband, killed, children, another, day...",left husband killed child another day america
1,of course it has a song,"[course, song]",course song
2,the actor and his longtime girlfriend anna ebe...,"[actor, longtime, girlfriend, anna, eberstein,...",actor longtime girlfriend anna eberstein tied ...
3,the actor gives dems an asskicking for not fig...,"[actor, gives, dems, asskicking, fighting, har...",actor give dems asskicking fighting hard enoug...
4,the dietland actress said using the bags is a ...,"[dietland, actress, said, using, bags, really,...",dietland actress said using bag really cathart...
