In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from textblob import Word

In [3]:
# Read the training and test sets from CSV files in the working directory
# (training file first, then the test file).
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('train_file.csv', 'test_file.csv')
)
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,IDLink,Title,Headline,Source,Topic,PublishDate,Facebook,GooglePlus,LinkedIn,SentimentTitle,SentimentHeadline
0,Tr3CMgRv1N,Obama Lays Wreath at Arlington National Cemetery,Obama Lays Wreath at Arlington National Cemete...,USA TODAY,obama,2002-04-02 00:00:00,-1,-1,-1,0.0,-0.0533
1,Wc81vGp8qZ,A Look at the Health of the Chinese Economy,"Tim Haywood, investment director business-unit...",Bloomberg,economy,2008-09-20 00:00:00,-1,-1,-1,0.208333,-0.156386
2,zNGH03CrZH,Nouriel Roubini: Global Economy Not Back to 2008,"Nouriel Roubini, NYU professor and chairman at...",Bloomberg,economy,2012-01-28 00:00:00,-1,-1,-1,-0.42521,0.139754
3,3sM1H0W8ts,Finland GDP Expands In Q4,Finland's economy expanded marginally in the t...,RTT News,economy,2015-03-01 00:06:00,-1,-1,-1,0.0,0.026064
4,wUbnxgvqaZ,"Tourism, govt spending buoys Thai economy in J...",Tourism and public spending continued to boost...,The Nation - Thailand&#39;s English news,economy,2015-03-01 00:11:00,-1,-1,-1,0.0,0.141084


In [4]:
# Lower-case the Title and Headline text of both frames.
def _lowercase_tokens(text):
    """Lower-case each whitespace-separated token of `text` and re-join the tokens with single spaces."""
    return " ".join(token.lower() for token in text.split())


for frame in (train_df, test_df):
    for text_column in ('Title', 'Headline'):
        frame[text_column] = frame[text_column].apply(_lowercase_tokens)

train_df['Title'].head()

0     obama lays wreath at arlington national cemetery
1          a look at the health of the chinese economy
2     nouriel roubini: global economy not back to 2008
3                            finland gdp expands in q4
4    tourism, govt spending buoys thai economy in j...
Name: Title, dtype: object

In [5]:
# Remove punctuation from the Title and Headline text of both frames by
# deleting every character that is neither a word character (\w) nor
# whitespace (\s).
# - The pattern is a raw string so that `\w` and `\s` reach the regex engine
#   instead of being parsed as (invalid) Python string escapes.
# - `regex=True` is passed explicitly: since pandas 2.0 `Series.str.replace`
#   matches the pattern literally by default, which would leave the text
#   unchanged.
PUNCTUATION_PATTERN = r'[^\w\s]'

for frame in (train_df, test_df):
    for text_column in ('Title', 'Headline'):
        frame[text_column] = frame[text_column].str.replace(
            PUNCTUATION_PATTERN, '', regex=True
        )

test_df['Title'].head()

0    sliding economy fg fights back with n3trn tsa ...
1    microsoft shows how hololens can bring distant...
2    microsofts twitter robot praises hitler trump ...
3    flood of central bank moves cant get world eco...
4    usdjpy bears lining up on mixed us economy out...
Name: Title, dtype: object

In [8]:
# Remove English stopwords from the Title and Headline text of both frames.
#
# Load the stopword list; if the NLTK corpus is not installed yet, download it
# first so this cell does not depend on a download cell having been run
# beforehand.
try:
    stop = stopwords.words('english')
except LookupError:
    import nltk
    nltk.download('stopwords')
    stop = stopwords.words('english')

# Membership tests against a set are O(1); testing every token against the
# list would scan the whole list each time.
stop_set = set(stop)


def _drop_stopwords(text):
    """Return `text` without the whitespace-separated tokens found in the stopword set."""
    return " ".join(token for token in text.split() if token not in stop_set)


for frame in (train_df, test_df):
    for text_column in ('Title', 'Headline'):
        frame[text_column] = frame[text_column].apply(_drop_stopwords)

test_df['Title'].head()

0       sliding economy fg fights back n3trn tsa funds
1    microsoft shows hololens bring distant family ...
2    microsofts twitter robot praises hitler trump ...
3    flood central bank moves cant get world econom...
4         usdjpy bears lining mixed us economy outlook
Name: Title, dtype: object

In [7]:
# Download the NLTK 'stopwords' corpus that stopwords.words('english') reads.
# NOTE(review): this cell is positioned after the cell that calls
# stopwords.words('english') (and carries execution count In [7] vs In [8]);
# on a fresh kernel without the corpus it would have to be run first.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Sourabh
[nltk_data]     Rajey\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [9]:
# Replace every token by its Porter stem so that inflected forms of a word
# collapse onto one token.
from nltk.stem import PorterStemmer

st = PorterStemmer()


def _stem_tokens(text):
    """Return `text` with each whitespace-separated token replaced by its Porter stem."""
    return " ".join(map(st.stem, text.split()))


for frame in (train_df, test_df):
    for text_column in ('Title', 'Headline'):
        frame[text_column] = frame[text_column].apply(_stem_tokens)

test_df['Title'].head()

0           slide economi fg fight back n3trn tsa fund
1    microsoft show hololen bring distant famili me...
2    microsoft twitter robot prais hitler trump rec...
3    flood central bank move cant get world economi...
4              usdjpi bear line mix us economi outlook
Name: Title, dtype: object

In [10]:
# Encode the Topic column as integer indices starting at 1.
#
# The topic -> index mapping is learned from the training frame only and then
# applied to the test frame, so a given topic always receives the same index
# in both frames. Factorizing each frame separately numbers topics by that
# frame's own sorted unique values, which silently diverges whenever the two
# topic sets differ.
topic_codes, topic_levels = pd.factorize(train_df['Topic'], sort=True)
train_df['TopicIndex'] = topic_codes + 1

# Topics that do not occur in the training frame (and missing values) get
# code -1 from Categorical, i.e. TopicIndex 0.
test_df['TopicIndex'] = (
    pd.Categorical(test_df['Topic'], categories=topic_levels).codes.astype('int64') + 1
)
train_df.head(10)

Unnamed: 0,IDLink,Title,Headline,Source,Topic,PublishDate,Facebook,GooglePlus,LinkedIn,SentimentTitle,SentimentHeadline,TopicIndex
0,Tr3CMgRv1N,obama lay wreath arlington nation cemeteri,obama lay wreath arlington nation cemeteri pre...,USA TODAY,obama,2002-04-02 00:00:00,-1,-1,-1,0.0,-0.0533,3
1,Wc81vGp8qZ,look health chines economi,tim haywood invest director businessunit head ...,Bloomberg,economy,2008-09-20 00:00:00,-1,-1,-1,0.208333,-0.156386,1
2,zNGH03CrZH,nouriel roubini global economi back 2008,nouriel roubini nyu professor chairman roubini...,Bloomberg,economy,2012-01-28 00:00:00,-1,-1,-1,-0.42521,0.139754,1
3,3sM1H0W8ts,finland gdp expand q4,finland economi expand margin three month end ...,RTT News,economy,2015-03-01 00:06:00,-1,-1,-1,0.0,0.026064,1
4,wUbnxgvqaZ,tourism govt spend buoy thai economi januari,tourism public spend continu boost economi jan...,The Nation - Thailand&#39;s English news,economy,2015-03-01 00:11:00,-1,-1,-1,0.0,0.141084,1
5,1P7kLrnWEp,intellitec solut host 13th annual spring micro...,100 attende expect see latest version microsof...,PRWeb,microsoft,2015-03-01 00:19:00,-1,-1,-1,-0.075378,0.036773,2
6,lKg2pImhCl,monday 29 feb 2016,ramallah februari 25 2016 wafa palestin liber ...,,palestine,2016-02-28 14:03:00,0,0,0,0.0,-0.005906,4
7,X2KssRh8hS,obama star pay music tribut ray charl,first ladi michel obama speak state din room w...,Coast Reporter,obama,2015-03-01 00:45:00,-1,-1,-1,0.083333,0.103003,3
8,xCHOHAl8v8,fire claim 100yearold barn hancock counti,hancock counti man lost barn earli monday morn...,WTHR Indianapolis,palestine,2015-03-01 01:20:00,-1,-1,-1,-0.173925,-0.050185,4
9,OR6Xuthveg,microsoft new window 10 ad target appl,new delhi feb29 technolog giant microsoft targ...,New Kerala,microsoft,2015-03-01 01:32:00,-1,-1,-1,-0.059536,-0.081715,2


In [12]:
#combining the numerical feature and categorical feature using DataFrameMapper
from sklearn_pandas import DataFrameMapper
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# Replace every missing value in the training frame with an empty string.
train_df = train_df.fillna(value='')

In [14]:
# Feature mapper for the title model: the Title text is turned into a TF-IDF
# vector (at most 1000 unigram features), while the numeric columns are listed
# with a `None` transformer. `default=False` is passed for all other columns.
title_vectorizer = TfidfVectorizer(
    max_features=1000,
    lowercase=True,
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 1),
)
title_numeric_columns = [
    (column, None)
    for column in ('Facebook', 'GooglePlus', 'LinkedIn', 'TopicIndex')
]
mapper_title = DataFrameMapper(
    [('Title', title_vectorizer)] + title_numeric_columns,
    default=False,
)

In [15]:
# Feature mapper for the headline model: the Headline text is turned into a
# TF-IDF vector (at most 1000 unigram features), while the numeric columns are
# listed with a `None` transformer. `default=False` is passed for all other
# columns.
headline_vectorizer = TfidfVectorizer(
    max_features=1000,
    lowercase=True,
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 1),
)
headline_numeric_columns = [
    (column, None)
    for column in ('Facebook', 'GooglePlus', 'LinkedIn', 'TopicIndex')
]
mapper_headline = DataFrameMapper(
    [('Headline', headline_vectorizer)] + headline_numeric_columns,
    default=False,
)

In [16]:

# Regression targets for the two tasks.
labels_title = train_df['SentimentTitle']
labels_headline = train_df['SentimentHeadline']

# Title task: fit the mapper on the training frame, then apply the fitted
# mapper to the test frame.
features_title = mapper_title.fit_transform(train_df)
test_features_title = mapper_title.transform(test_df)

# Headline task: same procedure with the headline mapper.
features_headline = mapper_headline.fit_transform(train_df)
test_features_headline = mapper_headline.transform(test_df)

In [17]:
# Hold out 30% of the title features/labels for validation; the seed is fixed
# so the split is the same on every run.
from sklearn.model_selection import train_test_split

title_split = train_test_split(
    features_title,
    labels_title,
    test_size=0.30,
    random_state=42,
)
x_train, x_test, y_train, y_test = title_split

In [18]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

In [20]:

# Candidate hyper-parameter grid. It is only meant for a GridSearchCV run
# (clf_svr_1 = GridSearchCV(estimator, parameters)), which is disabled, so the
# grid is not used by the code below.
parameters = {
    'kernel': ('linear', 'rbf'),
    'C': [1, 10],
}

# Fit an SVR with gamma='auto' on the training split and report the mean
# absolute error on the held-out split.
estimator = SVR(gamma='auto')
predictions_svr = estimator.fit(x_train, y_train).predict(x_test)
mean_absolute_error(y_true=y_test, y_pred=predictions_svr)

0.09422634027204627

In [21]:
# Refit on the complete training set and predict the test set, once per target.
# The same estimator object is refit for the headline task, so after this cell
# it holds the headline model.
final_pred_title = estimator.fit(features_title, labels_title).predict(test_features_title)

final_pred_headline = estimator.fit(features_headline, labels_headline).predict(test_features_headline)

In [23]:
# Assemble the predictions next to their IDLink and write them to
# final_output_ZS.csv (UTF-8, without the index column).
submission_columns = {
    'IDLink': test_df['IDLink'],
    'SentimentTitle': list(final_pred_title),
    'SentimentHeadline': list(final_pred_headline),
}
final = pd.DataFrame(submission_columns)
final.to_csv('final_output_ZS.csv', encoding='utf-8', index=False)