In [1]:
import pandas as pd

Importing datasets

In [2]:
# Import PyDrive and associated libraries.
# This only needs to be done once per notebook.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user and build the PyDrive client.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Google Drive file ID -> local filename the content is saved under.
# A file ID looks like: laggVyWshwcyP6kEI-y_W3P8D26sz
DRIVE_FILES = {
    '1elNqcb5tW9snmf-xRfMK0AvLUfEcOhC1': 'news_summary.csv',
    '1rAKmHTbGPOeuREC7Olk5oNHjNrdFHKkz': 'news_summary_more.csv',
}

# Download each dataset into the working directory. Adding another file
# only requires a new entry in DRIVE_FILES, not another copy of this code.
for file_id, local_name in DRIVE_FILES.items():
    downloaded = drive.CreateFile({'id': file_id})
    downloaded.GetContentFile(local_name)

In [3]:
# Both CSV files are decoded with the same single-byte encoding.
# TODO: confirm this is the right encoding for each file.
# NOTE(review): outputs further down show garbled apostrophes in some
# headlines, so the encoding choice deserves a second look.
CSV_ENCODING = 'latin-1'
data = pd.read_csv('news_summary.csv', encoding=CSV_ENCODING)
more_data = pd.read_csv('news_summary_more.csv', encoding=CSV_ENCODING)

# Exploring the datasets

## news_summary dataset


In [4]:
# Summarise the small dataset: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4514 entries, 0 to 4513
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   author     4514 non-null   object
 1   date       4514 non-null   object
 2   headlines  4514 non-null   object
 3   read_more  4514 non-null   object
 4   text       4514 non-null   object
 5   ctext      4396 non-null   object
dtypes: object(6)
memory usage: 211.7+ KB


NaN values are found in the complete-text (`ctext`) column.

In [5]:
# Preview the first rows of the small dataset.
data.head()

Unnamed: 0,author,date,headlines,read_more,text,ctext
0,Chhavi Tyagi,"03 Aug 2017,Thursday",Daman & Diu revokes mandatory Rakshabandhan in...,http://www.hindustantimes.com/india-news/raksh...,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Daisy Mowke,"03 Aug 2017,Thursday",Malaika slams user who trolled her for 'divorc...,http://www.hindustantimes.com/bollywood/malaik...,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,Arshiya Chopra,"03 Aug 2017,Thursday",'Virgin' now corrected to 'Unmarried' in IGIMS...,http://www.hindustantimes.com/patna/bihar-igim...,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Sumedha Sehra,"03 Aug 2017,Thursday",Aaj aapne pakad liya: LeT man Dujana before be...,http://indiatoday.intoday.in/story/abu-dujana-...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Aarushi Maheshwari,"03 Aug 2017,Thursday",Hotel staff to get training to spot signs of s...,http://indiatoday.intoday.in/story/sex-traffic...,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


In [6]:
# Rows whose full article text (`ctext`) repeats an earlier row.
# Rows with a missing `ctext` are reported here too (see the NaN entries
# in the printed result).
ctext_is_repeat = data.duplicated(subset=['ctext'])
duplicateRows1 = data.loc[ctext_is_repeat]
print('complete text duplicates', duplicateRows1, sep='\n')

complete text duplicates
                  author  ...                                              ctext
42          Chhavi Tyagi  ...  The Daman and Diu administration on Wednesday ...
190         Chhavi Tyagi  ...  Charges and counter charges flew in the Lok Sa...
231   Niharika Prabhakar  ...                                                NaN
286        Saloni Tandon  ...                                                NaN
368         Chhavi Tyagi  ...  Bihar chief minister Nitish Kumar comfortably ...
...                  ...  ...                                                ...
4381        Chhavi Tyagi  ...  Rounding off a day of hectic electioneering in...
4423      Mansha Mahajan  ...                                                NaN
4454     Abhishek Bansal  ...                                                NaN
4500      Mansha Mahajan  ...                                                NaN
4508        Tarun Khanna  ...                                                NaN

[1

In [7]:
# Spot-check the summary text of two individual rows, selected by position.
for row_pos in (4283, 4285):
    print(data['text'].iloc[row_pos:row_pos + 1])

4283    Elections in Goa ended up in a hung Assembly, ...
Name: text, dtype: object
4285    Uttar Pradesh Chief Minister Akhilesh Yadav on...
Name: text, dtype: object


**Duplicates and NaNs are found in the complete-text attribute, which will not be used. The `'ctext'` column will be dropped anyway, and it is okay to have different summaries and headlines for the same ctext.**

**TODO: still need to look at examples of these duplicates to make sure.**

In [8]:
# Rows whose summary (`text`) repeats an earlier row.
text_is_repeat = data.duplicated(subset=['text'])
duplicateRows2 = data.loc[text_is_repeat]
print('summary text duplicates', duplicateRows2)


summary text duplicates Empty DataFrame
Columns: [author, date, headlines, read_more, text, ctext]
Index: []


In [9]:
# Rows whose headline repeats an earlier row.
headline_is_repeat = data.duplicated(subset=['headlines'])
duplicateRows3 = data.loc[headline_is_repeat]
print('headlines duplicates', duplicateRows3)

headlines duplicates Empty DataFrame
Columns: [author, date, headlines, read_more, text, ctext]
Index: []


In [10]:
# Keep only the two columns shared with the larger dataset:
# the headline and the summary text.
selected_features = data.loc[:, ['headlines', 'text']]
selected_features.head()

Unnamed: 0,headlines,text
0,Daman & Diu revokes mandatory Rakshabandhan in...,The Administration of Union Territory Daman an...
1,Malaika slams user who trolled her for 'divorc...,Malaika Arora slammed an Instagram user who tr...
2,'Virgin' now corrected to 'Unmarried' in IGIMS...,The Indira Gandhi Institute of Medical Science...
3,Aaj aapne pakad liya: LeT man Dujana before be...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Hotel staff to get training to spot signs of s...,Hotels in Maharashtra will train their staff t...


In [11]:
# True if any cell of the selected columns is missing.
selected_features.isna().to_numpy().any()


False

**The small dataset is clean: the selected `headlines` and `text` columns contain no duplicates and no missing values.**

## news_summary_more dataset




In [12]:
# Summarise the larger dataset: column names, non-null counts and dtypes.
more_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98401 entries, 0 to 98400
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   headlines  98401 non-null  object
 1   text       98401 non-null  object
dtypes: object(2)
memory usage: 1.5+ MB


In [13]:
# Preview the first rows of the larger dataset.
more_data.head()

Unnamed: 0,headlines,text
0,upGrad learner switches to career in ML & Al w...,"Saurav Kant, an alumnus of upGrad and IIIT-B's..."
1,Delhi techie wins free food from Swiggy for on...,Kunal Shah's credit card bill payment platform...
2,New Zealand end Rohit Sharma-led India's 12-ma...,New Zealand defeated India by 8 wickets in the...
3,Aegon life iTerm insurance plan helps customer...,"With Aegon Life iTerm Insurance plan, customer..."
4,"Have known Hirani for yrs, what if MeToo claim...",Speaking about the sexual harassment allegatio...


In [28]:
# Rows of the larger dataset that repeat an earlier row in every column.
exact_repeat_mask = more_data.duplicated()
duplicateRows_more = more_data.loc[exact_repeat_mask]
print(duplicateRows_more)

headlines duplicates                                                headlines                                               text
51012  Longest Test innings was longer than 10 footba...  Former Pakistani cricketer Hanif Mohammad batt...
53731  Captain once dropped himself, replacement brok...  England captain Mike Denness left himself out ...
55374  India marks National Mathematics Day in memory...  National Mathematics Day is celebrated on Dece...
58769  Indian bowler once took 2 hat-tricks in a sing...  Former Services' medium-pacer Joginder Rao pic...
59705  Why is England-Australia Test series called 'T...  England was beaten at home for the first time ...
60724  Sachin's debut was not telecast in India due t...  The Test in which Sachin Tendulkar made his in...
60885  Who is the cricketer with longest known surnam...  Fijian cricketer Ilikena Lasarusa Talebulamain...
62063  An ODI was once played in the middle of a Test...  Zimbabwe played an ODI against New Zealand in ...
62575  

In [30]:
# Drop fully identical rows (first occurrence is kept), then confirm that
# no exact duplicates remain: the printed frame should be empty.
more_data = more_data.drop_duplicates()
remaining_repeats = more_data.duplicated()
duplicateRows_more = more_data.loc[remaining_repeats]
print(duplicateRows_more)

Empty DataFrame
Columns: [headlines, text]
Index: []


In [40]:
# Within the first 10000 rows, list every row whose headline occurs more
# than once (keep=False flags all occurrences, not just the later ones).
first_10k = more_data[0:10000]
shares_headline = first_10k.duplicated(subset='headlines', keep=False)
duplicateRows_more = first_10k[shares_headline]
duplicateRows_more

Unnamed: 0,headlines,text
23,Isha Ambani features on February cover of Vogu...,Reliance Industries' Chairman Mukesh Ambani's ...
24,Indian Oil looking for annual deal to buy crud...,Indian Oil Corporation on Wednesday said it's ...
3292,Isha Ambani features on February cover of Vogu...,Reliance Industries' Chairman Mukesh Ambani's ...
3293,Indian Oil looking for annual deal to buy crud...,Indian Oil Corporation on Wednesday said it's ...


## All Data (the 2 datasets combined)

In [20]:
# Stack the small dataset's selected columns on top of the larger dataset,
# renumbering the rows from 0.
# NOTE(review): the recorded shape (102915 = 4514 + 98401 rows) matches the
# raw, not yet de-duplicated `more_data`; a fresh top-to-bottom run drops
# duplicates from `more_data` first and will therefore report fewer rows.
all_data = pd.concat((selected_features, more_data), axis=0, ignore_index=True)
all_data.shape

(102915, 2)

In [16]:
# Rows of the combined data that repeat an earlier row in every column.
repeat_mask = all_data.duplicated()
duplicateRowsDF = all_data.loc[repeat_mask]
print(duplicateRowsDF)

                                                headlines                                               text
55526   Longest Test innings was longer than 10 footba...  Former Pakistani cricketer Hanif Mohammad batt...
58245   Captain once dropped himself, replacement brok...  England captain Mike Denness left himself out ...
59888   India marks National Mathematics Day in memory...  National Mathematics Day is celebrated on Dece...
63283   Indian bowler once took 2 hat-tricks in a sing...  Former Services' medium-pacer Joginder Rao pic...
64219   Why is England-Australia Test series called 'T...  England was beaten at home for the first time ...
...                                                   ...                                                ...
102827  Shreya Ghoshal to get wax figure at Madame Tus...  Singer Shreya Ghoshal is set to get a wax figu...
102841             Ranchi becomes India's 26th Test venue  Ranchi's JSCA Stadium has become India's 26th ...
102847  Dhoni signs

In [25]:
# Drop fully identical rows from the combined data (first occurrence is
# kept), then confirm none remain: the printed frame should be empty.
all_data = all_data.drop_duplicates()
remaining_repeats = all_data.duplicated()
duplicateRowsDF = all_data.loc[remaining_repeats]
print(duplicateRowsDF)

Empty DataFrame
Columns: [headlines, text]
Index: []


In [26]:
# Every row whose summary text occurs more than once in the combined data
# (keep=False flags all occurrences), regardless of its headline.
shares_text = all_data.duplicated(subset='text', keep=False)
duplicateRowsDF = all_data.loc[shares_text]
print(duplicateRowsDF)

                                                headlines                                               text
141     Qatar complaints to WTO against Saudi-led trad...  Qatar on Monday filed a legal complaint at the...
169      Don?t make lynchistan out of Hindustan: Congress  Congress leader Mallikarjun Kharge on Monday s...
176     PM Modi can?t take over states: Kiren Rijiju o...  Minister of State for Home Affairs, Kiren Riji...
415     MP ?miracle baby? was buried alive by rape sur...  A baby boy, who survived after being buried al...
645     Mithali Raj asks for women?s IPL after World C...  India women's team captain Mithali Raj called ...
...                                                   ...                                                ...
89562   IndiaÃ¢ÂÂs Red fort shown as PakistanÃ¢ÂÂs...  The Red Fort with the Indian tricolour was sho...
95182   We are hopeful weÃ¢ÂÂll convince Vishwas: Ke...  Amid the ongoing rift in the Aam Aadmi Party, ...
95189   DonÃ¢ÂÂt 

In [19]:
# True if any cell of the combined data is missing.
all_data.isna().to_numpy().any()


False