In [21]:
import pandas as pd
import glob
import os

In [22]:
def readMessagesFromFile(fPath, colNames, **kwargs):
    """Reads a file with messages and corresponding sentiments and
    returns a pandas DataFrame.

    fPath    : Path to the file to read messages from.
    colNames : Column names for the returned DataFrame.
    Returns  : Pandas DataFrame containing messages and sentiments
               as columns.

    Sample Output:
                                                 message  sentiment
    0                           Wow... Loved this place.          1
    1  I learned that if an electric slicer is used t...        NaN
    2                   But they don't clean the chiles?        NaN
    3                                 Crust is not good.          0
    4          Not tasty and the texture was just nasty.          0

    """
    
    df = pd.read_csv(fPath,names=colNames,header=None,sep='\t')
    
    return df


In [23]:
readMessagesFromFile('C:/Users/Suhas/Downloads/recruitment-task/data/amazon_cells_labelled.txt',['message','sentiment'])

Unnamed: 0,message,sentiment
0,I try not to adjust the volume setting to avoi...,
1,So there is no way for me to plug it in here i...,0.0
2,"Good case, Excellent value.",1.0
3,I thought Motorola made reliable products!.,
4,Battery for Motorola Razr.,
5,Great for the jawbone.,1.0
6,When I got this item it was larger than I thou...,
7,(I looked for one that specifically said DCU-6...,
8,The first time it was turned on the screen dis...,
9,In some programs clicking it is the same as hi...,


In [24]:
def findFilesInDir(dirPath, pattern):
    """Finds the list of filenames in `dirPath` that match `pattern`.

    dirPath : Path to the directory to find files in.
    pattern : A glob pattern to match filenames against.
    Returns : List of filenames in `dirPath` that match `pattern`.
    """
    text_files = [f for f in os.listdir(dirPath) if f.endswith(pattern)]
    return text_files


In [25]:
findFilesInDir('C:\\Users\\Suhas\\Downloads\\recruitment-task\\data','.txt')

['amazon_cells_labelled.txt',
 'imdb_labelled.txt',
 'readme.txt',
 'yelp_labelled.txt']

In [26]:
import glob
ll = glob.glob('C:/Users/Suhas/Downloads/recruitment-task/data/*_labelled.txt')

In [27]:
# all_txt_files = list(filter(lambda x: x.endswith('.txt'), os.listdir(k)))
# all_txt_files

In [28]:
def readMessagesFromDir(dirPath, fileNamePattern, dfColNames, **kwargs):
    """For files in directory that match a pattern, return a dictionary of
    pandas DataFrames labeled by the filepaths.

    dirPath         : Path to the directory to find files in.
    fileNamePattern : A glob pattern to match filenames against.
    dfColNames      : Column names for the returned DataFrame.
    Returns         : Dictionary of DataFrames labeled by the filepaths.

    """
    import glob
    filepaths = glob.glob(dirPath + fileNamePattern)
    d = {}
    for fp in filepaths:
        df = pd.read_csv(fp,names=dfColNames,header=None,sep='\t')
        d[fp] = df
        
    return d


In [29]:
col = ['messages','sentiment']

DFDict = readMessagesFromDir('C:/Users/Suhas/Downloads/recruitment-task/data/','*_labelled.txt',col)

In [30]:
DFDict

{'C:/Users/Suhas/Downloads/recruitment-task/data\\amazon_cells_labelled.txt':                                                 messages  sentiment
 0      I try not to adjust the volume setting to avoi...        NaN
 1      So there is no way for me to plug it in here i...        0.0
 2                            Good case, Excellent value.        1.0
 3            I thought Motorola made reliable products!.        NaN
 4                             Battery for Motorola Razr.        NaN
 5                                 Great for the jawbone.        1.0
 6      When I got this item it was larger than I thou...        NaN
 7      (I looked for one that specifically said DCU-6...        NaN
 8      The first time it was turned on the screen dis...        NaN
 9      In some programs clicking it is the same as hi...        NaN
 10     Tied to charger for conversations lasting more...        0.0
 11                                     The mic is great.        1.0
 12     What happened was 

In [31]:
# df = pd.DataFrame(
#         {'col1': [1, 2], 'col2': [0.5, 0.75]}, index=['a', 'b'])

# frames = {}

# groups = ['123', '456']
# for grp in groups:
#     #do some calcs to get a dataframe called 'df'
#     frames[grp] = df

In [32]:
# Step 2: Compiling the corpus
# -----------------------------------------------------------------------------


def makeLabel(fromFilePath):
    """Make label for file from file path.

    fromFilePath : Input file path.
    Returns      : String label.

    Sample output:

    '../abc/xyz_labelled.txt' -> 'xyz'
    """
    
    return (os.path.basename(fromFilePath)).split("_labelled")[0]


def concatDataFrames(msgDFDict):
    """Concatenate DataFrames by rows adding a column that indicates the
    source of the message. Make sure that the index for each row is unique.
    
    Sample output:

                                             messages  sentiment         label
    The CG opening sequence in space looked like i...          0          imdb
    Then one day, I went to use them and the recie...        NaN  amazon_cells
    And the pho is much better when it is served f...        NaN          yelp
    I have always had cases for my cell phones bec...        NaN  amazon_cells
                    I'll let you know how it goes....        NaN          yelp
                                  It looks very nice.          1  amazon_cells
        The Veggitarian platter is out of this world!          1          yelp
                  It looked like a wonderful story.            1          imdb
                       Too much hassle for my liking.        NaN  amazon_cells
    So far so good with this one, plus the glowing...        NaN  amazon_cells
    """
    old_keys = [*msgDFDict.keys()]

    new_labels = [makeLabel(x) for x in old_keys]

    for i in range(len(old_keys)):
        msgDFDict[new_labels[i]] = msgDFDict.pop(old_keys[i])
    
    dd1 = { k:v for k, v in msgDFDict.items()}

    df11 = pd.concat(dd1, axis=0)
    df11['labels'] = df11.index
    df11['labels'] = df11['labels'].apply(lambda x: x[0])
    df11.reset_index(drop = True, inplace = True)
    # 19086
    # 18828
    df11 = df11.drop_duplicates()
    df11.dropna(inplace=True)
    df11['sentiment'] = df11['sentiment'].astype('int64')
        
    return df11

# fp = glob.glob('C:/Users/Suhas/Downloads/recruitment-task/data/'+'*_labelled.txt')
# labels = makeLabel(fp)
# labels

# Use functions above to generate a DF which contains messages from
# all sources with corresponding sentiments and labels.

# Some of the messages do not have the sentiment populated. Let's drop
# these from the DF here.

In [33]:
msgsDF = concatDataFrames(DFDict)

In [34]:
msgsDF.shape

(2731, 3)

In [35]:
msgsDF.head()

Unnamed: 0,messages,sentiment,labels
1,So there is no way for me to plug it in here i...,0,amazon_cells
2,"Good case, Excellent value.",1,amazon_cells
5,Great for the jawbone.,1,amazon_cells
10,Tied to charger for conversations lasting more...,0,amazon_cells
11,The mic is great.,1,amazon_cells


In [36]:
# Step 3: Counting the vocabulary
# -----------------------------------------------------------------------------


def makeTermsFrom(msg):
    """Use this function to convert a message into vocabulary terms to
    avoid confusion regarding what is a valid term.
    """
    return [m for m in msg.lower().split() if m]


def countVocabulary(msgsDF):
    """Take a DF of messages, sentiments, and labels and return a DF with
    terms, sentiments, labels, and the corresponding counts. Write
    whatever helper functions are required to achieve this task.

    Sample output:
              term  sentiment         label  count
    0            !          0  amazon_cells      1
    1            !          0          yelp      1
    2            !          1  amazon_cells      1
    3            !          1          imdb      1
    4            !          1          yelp      1
    5           !!          1  amazon_cells      1
    6          !!!          1          yelp      1
    7     !....the          0          yelp      1
    8          !2.          1  amazon_cells      1
    9           !i          1  amazon_cells      1
    10       "1.2"          1  amazon_cells      1
    11        "10"          1          imdb      1
    12          "a          1          imdb      1
    13     "about"          1          imdb      1
    14     "acting          1          imdb      1
    15        "are          0          yelp      1
    16        "art          1          imdb      1
    17         "at          1          imdb      1
    18        "big          1          imdb      1
    19  "breeders"          0          imdb      1
    20      "clip"          0  amazon_cells      1
    21   "collect"          0          imdb      1
    22    "crumby"          0          yelp      1
    23        "don          1          imdb      1
    24        "eel          0          yelp      1
    """
    msgsDF['messages'] = msgsDF['messages'].apply(lambda x: makeTermsFrom(x))
    
    count_v = pd.DataFrame([(row.sentiment,row.labels, word) for row in msgsDF.itertuples() 
                          for word in row.messages], 
                         columns=['sentiment','labels','terms'])
    count_voc = count_v.groupby(['terms','sentiment'])['labels'].value_counts()
    count_vocab = pd.DataFrame({'count': count_voc})
    count_vocab_df = count_vocab.reset_index()
    
    return count_vocab_df


counts = countVocabulary(msgsDF)


In [37]:
counts

Unnamed: 0,terms,sentiment,labels,count
0,!,0,amazon_cells,1
1,!,0,yelp,1
2,!,1,amazon_cells,1
3,!,1,imdb,1
4,!,1,yelp,1
5,!!,1,amazon_cells,1
6,!!!,1,yelp,1
7,!....the,0,yelp,1
8,!2.,1,amazon_cells,1
9,!i,1,amazon_cells,1


In [41]:
counts.to_csv("Output data")

============================================================================================================

In [394]:
dft = msgsDF[:3].copy()

In [395]:
dft

Unnamed: 0,messages,sentiments,labels
1,So there is no way for me to plug it in here i...,0.0,amazon_cells
2,"Good case, Excellent value.",1.0,amazon_cells
5,Great for the jawbone.,1.0,amazon_cells


In [396]:
dft['labels'][2] = 'imdb'
dft['messages'][2] = 'Good case so Excellent value.'
dft['messages'][5] = 'so Great for the jawbone.'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [397]:
dft['messages'] = dft['messages'].apply(lambda x: makeTermsFrom(x))
dft

Unnamed: 0,messages,sentiments,labels
1,"[so, there, is, no, way, for, me, to, plug, it...",0.0,amazon_cells
2,"[good, case, so, excellent, value.]",1.0,imdb
5,"[so, great, for, the, jawbone.]",1.0,amazon_cells


In [407]:
dft.sentiments = dft.sentiments.astype('int64')


In [408]:
dft

Unnamed: 0,messages,sentiments,labels
1,"[so, there, is, no, way, for, me, to, plug, it...",0,amazon_cells
2,"[good, case, so, excellent, value.]",1,imdb
5,"[so, great, for, the, jawbone.]",1,amazon_cells


In [418]:
longframe = pd.DataFrame([(row.sentiments,row.labels, word) for row in dft.itertuples() 
                          for word in row.messages], 
                         columns=['sentiments', 'labels','terms'])

In [419]:
longframe

Unnamed: 0,sentiments,labels,terms
0,0,amazon_cells,so
1,0,amazon_cells,there
2,0,amazon_cells,is
3,0,amazon_cells,no
4,0,amazon_cells,way
5,0,amazon_cells,for
6,0,amazon_cells,me
7,0,amazon_cells,to
8,0,amazon_cells,plug
9,0,amazon_cells,it


In [423]:
count = longframe.groupby(['terms','sentiments'])['labels'].value_counts()

In [456]:
count

terms       sentiments  labels      
a           0           amazon_cells    1
by          0           amazon_cells    1
case        1           imdb            1
converter.  0           amazon_cells    1
excellent   1           imdb            1
for         0           amazon_cells    1
            1           amazon_cells    1
go          0           amazon_cells    1
good        1           imdb            1
great       1           amazon_cells    1
here        0           amazon_cells    1
i           0           amazon_cells    1
in          0           amazon_cells    2
is          0           amazon_cells    1
it          0           amazon_cells    1
jawbone.    1           amazon_cells    1
me          0           amazon_cells    1
no          0           amazon_cells    1
plug        0           amazon_cells    1
so          0           amazon_cells    1
            1           amazon_cells    1
                        imdb            1
the         0           amazon_cells   

In [426]:
result = pd.DataFrame({'count': count})
result = result.reset_index()
result

Unnamed: 0,terms,sentiments,labels,count
0,a,0,amazon_cells,1
1,by,0,amazon_cells,1
2,case,1,imdb,1
3,converter.,0,amazon_cells,1
4,excellent,1,imdb,1
5,for,0,amazon_cells,1
6,for,1,amazon_cells,1
7,go,0,amazon_cells,1
8,good,1,imdb,1
9,great,1,amazon_cells,1


In [459]:
result.terms[0]

'a'

In [311]:
df = pd.DataFrame(
    {'score': [5, 8], 'sentence': ["This is a sentence.", "Another sentence sentence?"]})
df['sentence'] = df['sentence'].str.findall(r'\w+')


In [312]:
df

Unnamed: 0,score,sentence
0,5,"[This, is, a, sentence]"
1,8,"[Another, sentence, sentence]"


In [313]:
longframe = pd.DataFrame([(row.score, word) for row in df.itertuples() 
                          for word in row.sentence], 
                         columns=['score', 'word'])

In [314]:
longframe

Unnamed: 0,score,word
0,5,This
1,5,is
2,5,a
3,5,sentence
4,8,Another
5,8,sentence
6,8,sentence
