In [1]:
import pandas as pd
import codecs
from sklearn.svm import SVR
from sklearn.feature_extraction.text import CountVectorizer
import re

pd.set_option('display.max_colwidth', -1)

In [2]:
def re_encode(source_file,
            source_encoding,
            destination_file,
            destination_encoding):
    """
    Change encoding of a file and write it as a new file
    """
    with codecs.open(source_file,
                    'r',
                    source_encoding) as source_handle:
        with codecs.open(destination_file,
                        'w',
                        destination_encoding) as destination_handle:
            contents = source_handle.read()
            destination_handle.write(contents)
            

In [3]:
def standardize_column_names(df):
    """
    Convert all columns to lowercase
    Replace spaces with underscores
    """
    prior_columns = list(df.columns)
    new_columns = [column_name.lower().replace(' ', '_') for column_name in prior_columns]
    df.columns = new_columns

In [93]:

def clean_html_body(text_body):
    """
    Pseudocode

    Remove <pre> tag section
    Remove <code> tag section
    Remove <anchor> tag section
    Extract all text from the html content
    Remove new line characters
    Return clean text
    """
    pre_tag_pattern = r'<pre>.*</pre>'
    code_tag_pattern = r'<code>.*</code>'
    anchor_tag_pattern = r'<a.*</a>'
    pre_tag_removed = re.sub(pre_tag_pattern, '',  text_body)
    code_tag_removed = re.sub(code_tag_pattern, '', pre_tag_removed)
    anchor_tag_removed = re.sub(anchor_tag_pattern, '', code_tag_removed)
    return BeautifulSoup(anchor_tag_removed).get_text().replace('\n', ' ').replace('\r', '')

In [4]:
# Re encoding files to utf-8 format            
            
re_encode('Data/Questions.csv',
         'macintosh',
         'Data/Questions_utf8.csv',
         'utf-8')       

re_encode('Data/Answers.csv',
         'macintosh',
         'Data/Answers_utf8.csv',
         'utf-8')  

re_encode('Data/Tags.csv',
         'macintosh',
         'Data/Tags_utf8.csv',
         'utf-8')  

In [5]:
questions = pd.read_csv('Data/Questions_utf8.csv',
                       encoding='utf-8')
answers = pd.read_csv('Data/Answers_utf8.csv')
tags = pd.read_csv('Data/Tags_utf8.csv')

# Standardizing columns
standardize_column_names(questions)
standardize_column_names(answers)
standardize_column_names(tags)

In [6]:
questions.head(5)

Unnamed: 0,id,owneruserid,creationdate,score,title,body
0,469,147.0,2008-08-02T15:11:16Z,21,How can I find the full path to a font from its display name on a Mac?,"<p>I am using the Photoshop's javascript API to find the fonts in a given PSD.</p>\n\n<p>Given a font name returned by the API, I want to find the actual physical font file that that font name corresponds to on the disc.</p>\n\n<p>This is all happening in a python program running on OSX so I guess I'm looking for one of:</p>\n\n<ul>\n<li>Some Photoshop javascript</li>\n<li>A Python function</li>\n<li>An OSX API that I can call from python</li>\n</ul>\n"
1,502,147.0,2008-08-02T17:01:58Z,27,Get a preview JPEG of a PDF on Windows?,"<p>I have a cross-platform (Python) application which needs to generate a JPEG preview of the first page of a PDF.</p>\n\n<p>On the Mac I am spawning <a href=""http://developer.apple.com/documentation/Darwin/Reference/ManPages/man1/sips.1.html"">sips</a>. Is there something similarly simple I can do on Windows?</p>\n"
2,535,154.0,2008-08-02T18:43:54Z,40,Continuous Integration System for a Python Codebase,"<p>I'm starting work on a hobby project with a python codebase and would like to set up some form of continuous integration (i.e. running a battery of test-cases each time a check-in is made and sending nag e-mails to responsible persons when the tests fail) similar to CruiseControl or TeamCity.</p>\n\n<p>I realize I could do this with hooks in most VCSes, but that requires that the tests run on the same machine as the version control server, which isn't as elegant as I would like. Does anyone have any suggestions for a small, user-friendly, open-source continuous integration system suitable for a Python codebase?</p>\n"
3,594,116.0,2008-08-03T01:15:08Z,25,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a result set. What are the tradeoff of each?</p>\n
4,683,199.0,2008-08-03T13:19:16Z,28,Using 'in' to match an attribute of Python objects in an array,"<p>I don't remember whether I was dreaming or not but I seem to recall there being a function which allowed something like,</p>\r\n\r\n<pre><code>foo in iter_attr(array of python objects, attribute name)</code></pre>\r\n\r\n<p>I've looked over the docs but this kind of thing doesn't fall under any obvious listed headers</p>"


In [7]:
answers.head(5)

Unnamed: 0,id,owneruserid,creationdate,parentid,score,body
0,497,50.0,2008-08-02T16:56:53Z,469,4,<p>open up a terminal (Applications-&gt;Utilities-&gt;Terminal) and type this in:</p>\r\n\r\n<pre><code>locate InsertFontHere<br></code></pre>\r\n\r\n<p>This will spit out every file that has the name you want.</p>\r\n\r\n<p>Warning: there may be alot to wade through.</p>
1,518,153.0,2008-08-02T17:42:28Z,469,2,"<p>I haven't been able to find anything that does this directly. I think you'll have to iterate through the various font folders on the system: <code>/System/Library/Fonts</code>, <code>/Library/Fonts</code>, and there can probably be a user-level directory as well <code>~/Library/Fonts</code>.</p>\n"
2,536,161.0,2008-08-02T18:49:07Z,502,9,"<p>You can use ImageMagick's convert utility for this, see some examples in <a href=""https://web.archive.org/web/20120413111338/http://studio.imagemagick.org/pipermail/magick-users/2002-May/002636.html"" rel=""nofollow"">http://studio.imagemagick.org/pipermail/magick-users/2002-May/002636.html</a>\n:</p>\n\n<blockquote>\n<pre><code>Convert taxes.pdf taxes.jpg \n</code></pre>\n \n <p>Will convert a two page PDF file into [2] jpeg files: taxes.jpg.0,\n taxes.jpg.1</p>\n \n <p>I can also convert these JPEGS to a thumbnail as follows:</p>\n\n<pre><code>convert -size 120x120 taxes.jpg.0 -geometry 120x120 +profile '*' thumbnail.jpg\n</code></pre>\n \n <p>I can even convert the PDF directly to a jpeg thumbnail as follows:</p>\n\n<pre><code>convert -size 120x120 taxes.pdf -geometry 120x120 +profile '*' thumbnail.jpg\n</code></pre>\n \n <p>This will result in a thumbnail.jpg.0 and thumbnail.jpg.1 for the two\n pages.</p>\n</blockquote>\n"
3,538,156.0,2008-08-02T18:56:56Z,535,23,"<p>One possibility is Hudson. It's written in Java, but there's integration with Python projects:</p>\n\n<blockquote>\n <p><a href=""http://redsolo.blogspot.com/2007/11/hudson-embraces-python.html"" rel=""nofollow"">Hudson embraces Python</a></p>\n</blockquote>\n\n<p>I've never tried it myself, however.</p>\n\n<p>(<strong>Update</strong>, Sept. 2011: After a trademark dispute Hudson has been renamed to <a href=""http://jenkins-ci.org/"" rel=""nofollow"">Jenkins</a>.)</p>\n"
4,541,157.0,2008-08-02T19:06:40Z,535,20,"<p>We run <a href=""http://buildbot.net/trac"">Buildbot - Trac</a> at work, I haven't used it too much since my code base isn't part of the release cycle yet. But we run the tests on different environments (OSX/Linux/Win) and it sends emails --and it's written in python.</p>"


In [8]:
tags.head(5)

Unnamed: 0,id,tag
0,469,python
1,469,osx
2,469,fonts
3,469,photoshop
4,502,python


### Restructuring `tags` data frame

In [9]:
num_missing_tags = tags.tag.isnull().sum()
print(f'Number of questions missing tags: {num_missing_tags}')

Number of questions missing tags: 443


*Removing these rows from the data frame*

In [10]:
tags.dropna(inplace=True)

### Grouping tags based on question id

In [11]:
tags = tags[["id", "tag"]].groupby('id')['tag'].apply(' '.join).reset_index()
tags.head()

Unnamed: 0,id,tag
0,469,python osx fonts photoshop
1,502,python windows image pdf
2,535,python continuous-integration extreme-programming
3,594,python sql database oracle cx-oracle
4,683,python arrays iteration


### Merging `tags` df with `questions` df

In [12]:
df = pd.merge(left=questions,
                     right=tags,
                     how='left',
                     on='id')
df.rename(columns={'tag': 'tags'},
         inplace=True)
df.head()

Unnamed: 0,id,owneruserid,creationdate,score,title,body,tags
0,469,147.0,2008-08-02T15:11:16Z,21,How can I find the full path to a font from its display name on a Mac?,"<p>I am using the Photoshop's javascript API to find the fonts in a given PSD.</p>\n\n<p>Given a font name returned by the API, I want to find the actual physical font file that that font name corresponds to on the disc.</p>\n\n<p>This is all happening in a python program running on OSX so I guess I'm looking for one of:</p>\n\n<ul>\n<li>Some Photoshop javascript</li>\n<li>A Python function</li>\n<li>An OSX API that I can call from python</li>\n</ul>\n",python osx fonts photoshop
1,502,147.0,2008-08-02T17:01:58Z,27,Get a preview JPEG of a PDF on Windows?,"<p>I have a cross-platform (Python) application which needs to generate a JPEG preview of the first page of a PDF.</p>\n\n<p>On the Mac I am spawning <a href=""http://developer.apple.com/documentation/Darwin/Reference/ManPages/man1/sips.1.html"">sips</a>. Is there something similarly simple I can do on Windows?</p>\n",python windows image pdf
2,535,154.0,2008-08-02T18:43:54Z,40,Continuous Integration System for a Python Codebase,"<p>I'm starting work on a hobby project with a python codebase and would like to set up some form of continuous integration (i.e. running a battery of test-cases each time a check-in is made and sending nag e-mails to responsible persons when the tests fail) similar to CruiseControl or TeamCity.</p>\n\n<p>I realize I could do this with hooks in most VCSes, but that requires that the tests run on the same machine as the version control server, which isn't as elegant as I would like. Does anyone have any suggestions for a small, user-friendly, open-source continuous integration system suitable for a Python codebase?</p>\n",python continuous-integration extreme-programming
3,594,116.0,2008-08-03T01:15:08Z,25,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a result set. What are the tradeoff of each?</p>\n,python sql database oracle cx-oracle
4,683,199.0,2008-08-03T13:19:16Z,28,Using 'in' to match an attribute of Python objects in an array,"<p>I don't remember whether I was dreaming or not but I seem to recall there being a function which allowed something like,</p>\r\n\r\n<pre><code>foo in iter_attr(array of python objects, attribute name)</code></pre>\r\n\r\n<p>I've looked over the docs but this kind of thing doesn't fall under any obvious listed headers</p>",python arrays iteration


### Merging `answers` with `questions`

In [13]:
temp_ans = answers[['parentid', 'body']].copy()
temp_ans_rename = {
    'parentid': 'id',
    'body': 'answer_body'
}
temp_ans.rename(columns=temp_ans_rename,
                          inplace=True)

temp_ans = temp_ans[['id', 'answer_body']].groupby('id')['answer_body'].apply(' '.join).reset_index()
raw_df = pd.merge(left=df,
                  right=temp_ans,
                  on='id',
                  how='left')
raw_df.head()

Unnamed: 0,id,owneruserid,creationdate,score,title,body,tags,answer_body
0,469,147.0,2008-08-02T15:11:16Z,21,How can I find the full path to a font from its display name on a Mac?,"<p>I am using the Photoshop's javascript API to find the fonts in a given PSD.</p>\n\n<p>Given a font name returned by the API, I want to find the actual physical font file that that font name corresponds to on the disc.</p>\n\n<p>This is all happening in a python program running on OSX so I guess I'm looking for one of:</p>\n\n<ul>\n<li>Some Photoshop javascript</li>\n<li>A Python function</li>\n<li>An OSX API that I can call from python</li>\n</ul>\n",python osx fonts photoshop,"<p>open up a terminal (Applications-&gt;Utilities-&gt;Terminal) and type this in:</p>\r\n\r\n<pre><code>locate InsertFontHere<br></code></pre>\r\n\r\n<p>This will spit out every file that has the name you want.</p>\r\n\r\n<p>Warning: there may be alot to wade through.</p> <p>I haven't been able to find anything that does this directly. I think you'll have to iterate through the various font folders on the system: <code>/System/Library/Fonts</code>, <code>/Library/Fonts</code>, and there can probably be a user-level directory as well <code>~/Library/Fonts</code>.</p>\n <p>Unfortunately the only API that isn't deprecated is located in the ApplicationServices framework, which doesn't have a bridge support file, and thus isn't available in the bridge. If you're wanting to use ctypes, you can use ATSFontGetFileReference after looking up the ATSFontRef.</p>\r\n\r\n<p>Cocoa doesn't have any native support, at least as of 10.5, for getting the location of a font.</p> <p>There must be a method in Cocoa to get a list of fonts, then you would have to use the PyObjC bindings to call it..</p>\n\n<p>Depending on what you need them for, you could probably just use something like the following..</p>\n\n<pre><code>import os\ndef get_font_list():\n fonts = []\n for font_path in [""/Library/Fonts"", os.path.expanduser(""~/Library/Fonts"")]:\n if os.path.isdir(font_path):\n fonts.extend(\n [os.path.join(font_path, cur_font) \n for cur_font in os.listdir(font_path)\n ]\n )\n return fonts\n</code></pre>\n"
1,502,147.0,2008-08-02T17:01:58Z,27,Get a preview JPEG of a PDF on Windows?,"<p>I have a cross-platform (Python) application which needs to generate a JPEG preview of the first page of a PDF.</p>\n\n<p>On the Mac I am spawning <a href=""http://developer.apple.com/documentation/Darwin/Reference/ManPages/man1/sips.1.html"">sips</a>. Is there something similarly simple I can do on Windows?</p>\n",python windows image pdf,"<p>You can use ImageMagick's convert utility for this, see some examples in <a href=""https://web.archive.org/web/20120413111338/http://studio.imagemagick.org/pipermail/magick-users/2002-May/002636.html"" rel=""nofollow"">http://studio.imagemagick.org/pipermail/magick-users/2002-May/002636.html</a>\n:</p>\n\n<blockquote>\n<pre><code>Convert taxes.pdf taxes.jpg \n</code></pre>\n \n <p>Will convert a two page PDF file into [2] jpeg files: taxes.jpg.0,\n taxes.jpg.1</p>\n \n <p>I can also convert these JPEGS to a thumbnail as follows:</p>\n\n<pre><code>convert -size 120x120 taxes.jpg.0 -geometry 120x120 +profile '*' thumbnail.jpg\n</code></pre>\n \n <p>I can even convert the PDF directly to a jpeg thumbnail as follows:</p>\n\n<pre><code>convert -size 120x120 taxes.pdf -geometry 120x120 +profile '*' thumbnail.jpg\n</code></pre>\n \n <p>This will result in a thumbnail.jpg.0 and thumbnail.jpg.1 for the two\n pages.</p>\n</blockquote>\n <p>Is the PC likely to have Acrobat installed? I think Acrobat installs a shell extension so previews of the first page of a PDF document appear in Windows Explorer's thumbnail view. You can get thumbnails yourself via the IExtractImage COM API, which you'll need to wrap. <a href=""http://www.vbaccelerator.com/home/net/code/libraries/shell_projects/Thumbnail_Extraction/article.asp"" rel=""nofollow"" title=""Domain Specific Development with Visual Studio DSL Tools."">VBAccelerator has an example in C#</a> that you could port to Python.</p>\n <p>ImageMagick delegates the PDF->bitmap conversion to GhostScript anyway, so here's a command you can use (it's based on the actual command listed by the <code>ps:alpha</code> delegate in ImageMagick, just adjusted to use JPEG as output):</p>\n\n<pre><code>gs -q -dQUIET -dPARANOIDSAFER -dBATCH -dNOPAUSE -dNOPROMPT \\n-dMaxBitmap=500000000 -dLastPage=1 -dAlignToPixels=0 -dGridFitTT=0 \\n-sDEVICE=jpeg -dTextAlphaBits=4 -dGraphicsAlphaBits=4 -r72x72 \\n-sOutputFile=$OUTPUT -f$INPUT\n</code></pre>\n\n<p>where <code>$OUTPUT</code> and <code>$INPUT</code> are the output and input filenames. Adjust the <code>72x72</code> to whatever resolution you need. (Obviously, strip out the backslashes if you're writing out the whole command as one line.)</p>\n\n<p>This is good for two reasons:</p>\n\n<ol>\n<li>You don't need to have ImageMagick installed anymore. Not that I have anything against ImageMagick (I love it to bits), but I believe in simple solutions.</li>\n<li>ImageMagick does a two-step conversion. First PDF->PPM, then PPM->JPEG. This way, the conversion is one-step.</li>\n</ol>\n\n<p>Other things to consider: with the files I've tested, PNG compresses better than JPEG. If you want to use PNG, change the <code>-sDEVICE=jpeg</code> to <code>-sDEVICE=png16m</code>.</p>\n"
2,535,154.0,2008-08-02T18:43:54Z,40,Continuous Integration System for a Python Codebase,"<p>I'm starting work on a hobby project with a python codebase and would like to set up some form of continuous integration (i.e. running a battery of test-cases each time a check-in is made and sending nag e-mails to responsible persons when the tests fail) similar to CruiseControl or TeamCity.</p>\n\n<p>I realize I could do this with hooks in most VCSes, but that requires that the tests run on the same machine as the version control server, which isn't as elegant as I would like. Does anyone have any suggestions for a small, user-friendly, open-source continuous integration system suitable for a Python codebase?</p>\n",python continuous-integration extreme-programming,"<p>One possibility is Hudson. It's written in Java, but there's integration with Python projects:</p>\n\n<blockquote>\n <p><a href=""http://redsolo.blogspot.com/2007/11/hudson-embraces-python.html"" rel=""nofollow"">Hudson embraces Python</a></p>\n</blockquote>\n\n<p>I've never tried it myself, however.</p>\n\n<p>(<strong>Update</strong>, Sept. 2011: After a trademark dispute Hudson has been renamed to <a href=""http://jenkins-ci.org/"" rel=""nofollow"">Jenkins</a>.)</p>\n <p>We run <a href=""http://buildbot.net/trac"">Buildbot - Trac</a> at work, I haven't used it too much since my code base isn't part of the release cycle yet. But we run the tests on different environments (OSX/Linux/Win) and it sends emails --and it's written in python.</p> <p>Second the Buildbot - Trac integration. You can find more information about the integration on the <a href=""http://buildbot.net/trac/wiki/BuildbotAndTrac"">Buildbot website</a>. At my previous job, we wrote and used the plugin they mention (tracbb).\r\nWhat the plugin does is rewriting all of the Buildbot urls so you can use Buildbot from within Trac. (http://example.com/tracbb).</p>\r\n\r\n<p>The really nice thing about Buildbot is that the configuration is written in Python. You can integrate your own Python code directly to the configuration. It's also very easy to write your own BuildSteps to execute specific tasks.</p>\r\n\r\n<p>We used BuildSteps to get the source from SVN, pull the dependencies, publish test results to WebDAV, etcetera.</p>\r\n\r\n<p>I wrote an X10 interface so we could send signals with build results. When the build failed, we switched on a red lava lamp. When the build succeeded, a green lava lamp switched on. Good times :-)</p> <p>We use both Buildbot and Hudson for Jython development. Both are useful, but have different strengths and weaknesses.</p>\n\n<p>Buildbot's configuration is pure Python and quite simple once you get the hang of it (look at the epydoc-generated API docs for the most current info). Buildbot makes it easier to define non-testing tasks¬†and distribute the testers. However, it really has no concept of individual tests, just textual, HTML, and summary output, so if you want to have multi-level browsable test output and so forth you'll have to build it yourself, or just use Hudson.</p>\n\n<p>Hudson has terrific support for drilling down from overall results into test suites and individual tests; it also is great for comparing test output between builds, but the distributed (master/slave) stuff is comparatively more complicated because you need a Java environment on the slaves too; also, Hudson is less tolerant of flaky network links between the master and slaves.</p>\n\n<p>So, to get the benefits of both tools, we run a single instance of Hudson, which catches the common test failures, then we do multi-platform regression with Buildbot.</p>\n\n<p>Here are our instances:</p>\n\n<ul>\n<li><a href=""http://bob.underboss.org:8080/job/jython/lastBuild/testReport/"">Jython Hudson</a></li>\n<li><a href=""http://www.acm.uiuc.edu/jython-buildbot/waterfall"">Jython buildbot</a></li>\n</ul>\n <p>We are using <a href=""http://bitten.edgewall.org/"" rel=""nofollow"">Bitten</a> wich is integrated with trac. And it's python based.</p>\n <p>TeamCity has some Python <a href=""http://www.jetbrains.net/confluence/display/TW/Python+Unit+Test+Reporting"" rel=""nofollow"">integration</a>.</p>\n\n<p>But TeamCity is:</p>\n\n<ul>\n<li>not open-source</li>\n<li>is not small, but rather feature rich</li>\n<li>is free for small-mid teams.</li>\n</ul>\n <p>I have very good experiences with <a href=""http://travis-ci.org/"" rel=""nofollow"">Travis-CI</a> for smaller code bases.\nThe main advantages are:</p>\n\n<ul>\n<li>setup is done in less than half a screen of config file</li>\n<li>you can do your own installation or just use the free hosted version</li>\n<li>semi-automatic setup for github repositories</li>\n<li>no account needed on website; login via github</li>\n</ul>\n\n<p>Some limitations:</p>\n\n<ul>\n<li><p>Python is not supported as a first class language (as of time of writing; but you can use pip and apt-get to install python dependencies; see <a href=""http://www.travisswicegood.com/2011/11/11/travis-and-python/"" rel=""nofollow"">this tutorial</a>)</p></li>\n<li><p>code has to be hosted on github (at least when using the official version)</p></li>\n</ul>\n"
3,594,116.0,2008-08-03T01:15:08Z,25,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a result set. What are the tradeoff of each?</p>\n,python sql database oracle cx-oracle,"<p>The canonical way is to use the built-in cursor iterator.</p>\n\n<pre><code>curs.execute('select * from people')\nfor row in curs:\n print row\n</code></pre>\n\n<hr>\n\n<p>You can use <code>fetchall()</code> to get all rows at once.</p>\n\n<pre><code>for row in curs.fetchall():\n print row\n</code></pre>\n\n<p>It can be convenient to use this to create a Python list containing the values returned:</p>\n\n<pre><code>curs.execute('select first_name from people')\nnames = [row[0] for row in curs.fetchall()]\n</code></pre>\n\n<p>This can be useful for smaller result sets, but can have bad side effects if the result set is large.</p>\n\n<ul>\n<li><p>You have to wait for the entire result set to be returned to\nyour client process.</p></li>\n<li><p>You may eat up a lot of memory in your client to hold\nthe built-up list.</p></li>\n<li><p>It may take a while for Python to construct and deconstruct the\nlist which you are going to immediately discard anyways.</p></li>\n</ul>\n\n<hr>\n\n<p>If you know there's a single row being returned in the result set you can call <code>fetchone()</code> to get the single row.</p>\n\n<pre><code>curs.execute('select max(x) from t')\nmaxValue = curs.fetchone()[0]\n</code></pre>\n\n<hr>\n\n<p>Finally, you can loop over the result set fetching one row at a time. In general, there's no particular advantage in doing this over using the iterator.</p>\n\n<pre><code>row = curs.fetchone()\nwhile row:\n print row\n row = curs.fetchone()\n</code></pre>\n <p>There's also the way <code>psyco-pg</code> seems to do it... From what I gather, it seems to create dictionary-like row-proxies to map key lookup into the memory block returned by the query. In that case, fetching the whole answer and working with a similar proxy-factory over the rows seems like useful idea. Come to think of it though, it feels more like Lua than Python.</p>\n\n<p>Also, this should be applicable to all <a href=""http://www.python.org/dev/peps/pep-0249/"" rel=""nofollow"">PEP-249 DBAPI2.0</a> interfaces, not just Oracle, or did you mean just <em>fastest</em> using <em>Oracle</em>?</p>\n <p>My preferred way is the cursor iterator, but setting first the arraysize property of the cursor. </p>\n\n<pre><code>curs.execute('select * from people')\ncurs.arraysize = 256\nfor row in curs:\n print row\n</code></pre>\n\n<p>In this example, cx_Oracle will fetch rows from Oracle 256 rows at a time, reducing the number of network round trips that need to be performed</p>\n"
4,683,199.0,2008-08-03T13:19:16Z,28,Using 'in' to match an attribute of Python objects in an array,"<p>I don't remember whether I was dreaming or not but I seem to recall there being a function which allowed something like,</p>\r\n\r\n<pre><code>foo in iter_attr(array of python objects, attribute name)</code></pre>\r\n\r\n<p>I've looked over the docs but this kind of thing doesn't fall under any obvious listed headers</p>",python arrays iteration,"<p>No, you were not dreaming. Python has a pretty excellent list comprehension system that lets you manipulate lists pretty elegantly, and depending on exactly what you want to accomplish, this can be done a couple of ways. In essence, what you're doing is saying ""For item in list if criteria.matches"", and from that you can just iterate through the results or dump the results into a new list.</p>\n\n<p>I'm going to crib an example from <a href=""http://diveintopython.net/functional_programming/filtering_lists.html"" rel=""nofollow"">Dive Into Python</a> here, because it's pretty elegant and they're smarter than I am. Here they're getting a list of files in a directory, then filtering the list for all files that match a regular expression criteria.</p>\n\n<blockquote>\n<pre><code> files = os.listdir(path) \n test = re.compile(""test\.py$"", re.IGNORECASE) \n files = [f for f in files if test.search(f)]\n</code></pre>\n</blockquote>\n\n<p>You could do this without regular expressions, for your example, for anything where your expression at the end returns true for a match. There are other options like using the filter() function, but if I were going to choose, I'd go with this.</p>\n\n<p>Eric Sipple</p>\n <p>I think:</p>\r\n\r\n<pre><code>#!/bin/python<br>bar in dict(Foo)<br></code></pre>\r\n\r\n<p>Is what you are thinking of. When trying to see if a certain key exists within a dictionary in python (python's version of a hash table) there are two ways to check. First is the <strong><code>has_key()</code></strong> method attached to the dictionary and second is the example given above. It will return a boolean value.</p>\r\n\r\n<p>That should answer your question.</p>\r\n\r\n<p>And now a little off topic to tie this in to the <em>list comprehension</em> answer previously given (for a bit more clarity). <em>List Comprehensions</em> construct a list from a basic <em>for loop</em> with modifiers. As an example (to clarify slightly), a way to use the <code>in dict</code> language construct in a _list comprehension_:</p>\r\n\r\n<p>Say you have a two dimensional dictionary <strong><code>foo</code></strong> and you only want the second dimension dictionaries which contain the key <strong><code>bar</code></strong>. A relatively straightforward way to do so would be to use a <em>list comprehension</em> with a conditional as follows:</p>\r\n\r\n<pre><code>#!/bin/python<br>baz = dict([(key, value) for key, value in foo if bar in value])<br></code></pre>\r\n\r\n<p>Note the <strong><code>if bar in value</code></strong> at the end of the statement<strong>, this is a modifying clause which tells the <em>list comprehension</em> to only keep those key-value pairs which meet the conditional.</strong> In this case <strong><code>baz</code></strong> is a new dictionary which contains only the dictionaries from foo which contain bar (Hopefully I didn't miss anything in that code example... you may have to take a look at the list comprehension documentation found in <a href=""http://docs.python.org/tut/node7.html#SECTION007140000000000000000"" rel=""nofollow"">docs.python.org tutorials</a> and at <a href=""http://www.secnetix.de/olli/Python/list_comprehensions.hawk"" rel=""nofollow"">secnetix.de</a>, both sites are good references if you have questions in the future.).</p> <p>Are you looking to get a list of objects that have a certain attribute? If so, a <a href=""http://docs.python.org/tut/node7.html#SECTION007140000000000000000"">list comprehension</a> is the right way to do this.</p>\r\n\r\n<pre><code>result = [obj for obj in listOfObjs if hasattr(obj, 'attributeName')]<br></code></pre> <p>What I was thinking of can be achieved using list comprehensions, but I thought that there was a function that did this in a slightly neater way.</p>\r\n\r\n<p>i.e. 'bar' is a list of objects, all of which have the attribute 'id'</p>\r\n\r\n<p>The mythical functional way:</p>\r\n\r\n<pre><code>foo = 12<br>foo in iter_attr(bar, 'id')</code></pre>\r\n\r\n<p>The list comprehension way:</p>\r\n\r\n<pre><code>foo = 12<br>foo in [obj.id for obj in bar]</code></pre>\r\n\r\n<p>In retrospect the list comprehension way is pretty neat anyway.</p> <p>you could always write one yourself:</p>\n\n<pre><code>def iterattr(iterator, attributename):\n for obj in iterator:\n yield getattr(obj, attributename)\n</code></pre>\n\n<p>will work with anything that iterates, be it a tuple, list, or whatever.</p>\n\n<p>I love python, it makes stuff like this very simple and no more of a hassle than neccessary, and in use stuff like this is hugely elegant.</p>\n <p>If you plan on searching anything of remotely decent size, your best bet is going to be to use a dictionary or a set. Otherwise, you basically have to iterate through every element of the iterator until you get to the one you want.</p>\n\n<p>If this isn't necessarily performance sensitive code, then the list comprehension way should work. But note that it is fairly inefficient because it goes over every element of the iterator and then goes BACK over it again until it finds what it wants.</p>\n\n<p>Remember, python has one of the most efficient hashing algorithms around. Use it to your advantage.</p>\n <p>Using a list comprehension would build a temporary list, which could eat all your memory if the sequence being searched is large. Even if the sequence is not large, building the list means iterating over the whole of the sequence before <code>in</code> could start its search.</p>\n\n<p>The temporary list can be avoiding by using a generator expression:</p>\n\n<pre><code>foo = 12\nfoo in (obj.id for obj in bar)\n</code></pre>\n\n<p>Now, as long as <code>obj.id == 12</code> near the start of <code>bar</code>, the search will be fast, even if <code>bar</code> is infinitely long.</p>\n\n<p>As @Matt suggested, it's a good idea to use <code>hasattr</code> if any of the objects in <code>bar</code> can be missing an <code>id</code> attribute:</p>\n\n<pre><code>foo = 12\nfoo in (obj.id for obj in bar if hasattr(obj, 'id'))\n</code></pre>\n <p>The function you are thinking of is probably <code>operator.attrgettter</code>. For example, to get a list that contains the value of each object's ""id"" attribute:</p>\n\n<pre><code>import operator\nids = map(operator.attrgetter(""id""), bar)</code></pre>\n\n<p>If you want to check whether the list contains an object with an id == 12, then a neat and efficient (i.e. doesn't iterate the whole list unnecessarily) way to do it is:</p>\n\n<pre><code>any(obj.id == 12 for obj in bar)</code></pre>\n\n<p>If you want to use 'in' with attrgetter, while still retaining lazy iteration of the list:</p>\n\n<p><pre><code>import operator,itertools\nfoo = 12\nfoo in itertools.imap(operator.attrgetter(""id""), bar)\n</pre></code></p>\n"


### Extract text from question body

In [92]:
raw_df['clean_q_body'] = raw_df['body'].apply(lambda x: clean_html_body(x))
raw_df.head(5)

Unnamed: 0,id,owneruserid,creationdate,score,title,body,tags,answer_body,clean_q_body
0,469,147.0,2008-08-02T15:11:16Z,21,How can I find the full path to a font from its display name on a Mac?,"<p>I am using the Photoshop's javascript API to find the fonts in a given PSD.</p>\n\n<p>Given a font name returned by the API, I want to find the actual physical font file that that font name corresponds to on the disc.</p>\n\n<p>This is all happening in a python program running on OSX so I guess I'm looking for one of:</p>\n\n<ul>\n<li>Some Photoshop javascript</li>\n<li>A Python function</li>\n<li>An OSX API that I can call from python</li>\n</ul>\n",python osx fonts photoshop,"<p>open up a terminal (Applications-&gt;Utilities-&gt;Terminal) and type this in:</p>\r\n\r\n<pre><code>locate InsertFontHere<br></code></pre>\r\n\r\n<p>This will spit out every file that has the name you want.</p>\r\n\r\n<p>Warning: there may be alot to wade through.</p> <p>I haven't been able to find anything that does this directly. I think you'll have to iterate through the various font folders on the system: <code>/System/Library/Fonts</code>, <code>/Library/Fonts</code>, and there can probably be a user-level directory as well <code>~/Library/Fonts</code>.</p>\n <p>Unfortunately the only API that isn't deprecated is located in the ApplicationServices framework, which doesn't have a bridge support file, and thus isn't available in the bridge. If you're wanting to use ctypes, you can use ATSFontGetFileReference after looking up the ATSFontRef.</p>\r\n\r\n<p>Cocoa doesn't have any native support, at least as of 10.5, for getting the location of a font.</p> <p>There must be a method in Cocoa to get a list of fonts, then you would have to use the PyObjC bindings to call it..</p>\n\n<p>Depending on what you need them for, you could probably just use something like the following..</p>\n\n<pre><code>import os\ndef get_font_list():\n fonts = []\n for font_path in [""/Library/Fonts"", os.path.expanduser(""~/Library/Fonts"")]:\n if os.path.isdir(font_path):\n fonts.extend(\n [os.path.join(font_path, cur_font) \n for cur_font in os.listdir(font_path)\n ]\n )\n return fonts\n</code></pre>\n","I am using the Photoshop's javascript API to find the fonts in a given PSD. Given a font name returned by the API, I want to find the actual physical font file that that font name corresponds to on the disc. This is all happening in a python program running on OSX so I guess I'm looking for one of: Some Photoshop javascript A Python function An OSX API that I can call from python"
1,502,147.0,2008-08-02T17:01:58Z,27,Get a preview JPEG of a PDF on Windows?,"<p>I have a cross-platform (Python) application which needs to generate a JPEG preview of the first page of a PDF.</p>\n\n<p>On the Mac I am spawning <a href=""http://developer.apple.com/documentation/Darwin/Reference/ManPages/man1/sips.1.html"">sips</a>. Is there something similarly simple I can do on Windows?</p>\n",python windows image pdf,"<p>You can use ImageMagick's convert utility for this, see some examples in <a href=""https://web.archive.org/web/20120413111338/http://studio.imagemagick.org/pipermail/magick-users/2002-May/002636.html"" rel=""nofollow"">http://studio.imagemagick.org/pipermail/magick-users/2002-May/002636.html</a>\n:</p>\n\n<blockquote>\n<pre><code>Convert taxes.pdf taxes.jpg \n</code></pre>\n \n <p>Will convert a two page PDF file into [2] jpeg files: taxes.jpg.0,\n taxes.jpg.1</p>\n \n <p>I can also convert these JPEGS to a thumbnail as follows:</p>\n\n<pre><code>convert -size 120x120 taxes.jpg.0 -geometry 120x120 +profile '*' thumbnail.jpg\n</code></pre>\n \n <p>I can even convert the PDF directly to a jpeg thumbnail as follows:</p>\n\n<pre><code>convert -size 120x120 taxes.pdf -geometry 120x120 +profile '*' thumbnail.jpg\n</code></pre>\n \n <p>This will result in a thumbnail.jpg.0 and thumbnail.jpg.1 for the two\n pages.</p>\n</blockquote>\n <p>Is the PC likely to have Acrobat installed? I think Acrobat installs a shell extension so previews of the first page of a PDF document appear in Windows Explorer's thumbnail view. You can get thumbnails yourself via the IExtractImage COM API, which you'll need to wrap. <a href=""http://www.vbaccelerator.com/home/net/code/libraries/shell_projects/Thumbnail_Extraction/article.asp"" rel=""nofollow"" title=""Domain Specific Development with Visual Studio DSL Tools."">VBAccelerator has an example in C#</a> that you could port to Python.</p>\n <p>ImageMagick delegates the PDF->bitmap conversion to GhostScript anyway, so here's a command you can use (it's based on the actual command listed by the <code>ps:alpha</code> delegate in ImageMagick, just adjusted to use JPEG as output):</p>\n\n<pre><code>gs -q -dQUIET -dPARANOIDSAFER -dBATCH -dNOPAUSE -dNOPROMPT \\n-dMaxBitmap=500000000 -dLastPage=1 -dAlignToPixels=0 -dGridFitTT=0 \\n-sDEVICE=jpeg -dTextAlphaBits=4 -dGraphicsAlphaBits=4 -r72x72 \\n-sOutputFile=$OUTPUT -f$INPUT\n</code></pre>\n\n<p>where <code>$OUTPUT</code> and <code>$INPUT</code> are the output and input filenames. Adjust the <code>72x72</code> to whatever resolution you need. (Obviously, strip out the backslashes if you're writing out the whole command as one line.)</p>\n\n<p>This is good for two reasons:</p>\n\n<ol>\n<li>You don't need to have ImageMagick installed anymore. Not that I have anything against ImageMagick (I love it to bits), but I believe in simple solutions.</li>\n<li>ImageMagick does a two-step conversion. First PDF->PPM, then PPM->JPEG. This way, the conversion is one-step.</li>\n</ol>\n\n<p>Other things to consider: with the files I've tested, PNG compresses better than JPEG. If you want to use PNG, change the <code>-sDEVICE=jpeg</code> to <code>-sDEVICE=png16m</code>.</p>\n",I have a cross-platform (Python) application which needs to generate a JPEG preview of the first page of a PDF. On the Mac I am spawning llalal. Is there something similarly simple I can do on Windows?
2,535,154.0,2008-08-02T18:43:54Z,40,Continuous Integration System for a Python Codebase,"<p>I'm starting work on a hobby project with a python codebase and would like to set up some form of continuous integration (i.e. running a battery of test-cases each time a check-in is made and sending nag e-mails to responsible persons when the tests fail) similar to CruiseControl or TeamCity.</p>\n\n<p>I realize I could do this with hooks in most VCSes, but that requires that the tests run on the same machine as the version control server, which isn't as elegant as I would like. Does anyone have any suggestions for a small, user-friendly, open-source continuous integration system suitable for a Python codebase?</p>\n",python continuous-integration extreme-programming,"<p>One possibility is Hudson. It's written in Java, but there's integration with Python projects:</p>\n\n<blockquote>\n <p><a href=""http://redsolo.blogspot.com/2007/11/hudson-embraces-python.html"" rel=""nofollow"">Hudson embraces Python</a></p>\n</blockquote>\n\n<p>I've never tried it myself, however.</p>\n\n<p>(<strong>Update</strong>, Sept. 2011: After a trademark dispute Hudson has been renamed to <a href=""http://jenkins-ci.org/"" rel=""nofollow"">Jenkins</a>.)</p>\n <p>We run <a href=""http://buildbot.net/trac"">Buildbot - Trac</a> at work, I haven't used it too much since my code base isn't part of the release cycle yet. But we run the tests on different environments (OSX/Linux/Win) and it sends emails --and it's written in python.</p> <p>Second the Buildbot - Trac integration. You can find more information about the integration on the <a href=""http://buildbot.net/trac/wiki/BuildbotAndTrac"">Buildbot website</a>. At my previous job, we wrote and used the plugin they mention (tracbb).\r\nWhat the plugin does is rewriting all of the Buildbot urls so you can use Buildbot from within Trac. (http://example.com/tracbb).</p>\r\n\r\n<p>The really nice thing about Buildbot is that the configuration is written in Python. You can integrate your own Python code directly to the configuration. It's also very easy to write your own BuildSteps to execute specific tasks.</p>\r\n\r\n<p>We used BuildSteps to get the source from SVN, pull the dependencies, publish test results to WebDAV, etcetera.</p>\r\n\r\n<p>I wrote an X10 interface so we could send signals with build results. When the build failed, we switched on a red lava lamp. When the build succeeded, a green lava lamp switched on. Good times :-)</p> <p>We use both Buildbot and Hudson for Jython development. Both are useful, but have different strengths and weaknesses.</p>\n\n<p>Buildbot's configuration is pure Python and quite simple once you get the hang of it (look at the epydoc-generated API docs for the most current info). Buildbot makes it easier to define non-testing tasks¬†and distribute the testers. However, it really has no concept of individual tests, just textual, HTML, and summary output, so if you want to have multi-level browsable test output and so forth you'll have to build it yourself, or just use Hudson.</p>\n\n<p>Hudson has terrific support for drilling down from overall results into test suites and individual tests; it also is great for comparing test output between builds, but the distributed (master/slave) stuff is comparatively more complicated because you need a Java environment on the slaves too; also, Hudson is less tolerant of flaky network links between the master and slaves.</p>\n\n<p>So, to get the benefits of both tools, we run a single instance of Hudson, which catches the common test failures, then we do multi-platform regression with Buildbot.</p>\n\n<p>Here are our instances:</p>\n\n<ul>\n<li><a href=""http://bob.underboss.org:8080/job/jython/lastBuild/testReport/"">Jython Hudson</a></li>\n<li><a href=""http://www.acm.uiuc.edu/jython-buildbot/waterfall"">Jython buildbot</a></li>\n</ul>\n <p>We are using <a href=""http://bitten.edgewall.org/"" rel=""nofollow"">Bitten</a> wich is integrated with trac. And it's python based.</p>\n <p>TeamCity has some Python <a href=""http://www.jetbrains.net/confluence/display/TW/Python+Unit+Test+Reporting"" rel=""nofollow"">integration</a>.</p>\n\n<p>But TeamCity is:</p>\n\n<ul>\n<li>not open-source</li>\n<li>is not small, but rather feature rich</li>\n<li>is free for small-mid teams.</li>\n</ul>\n <p>I have very good experiences with <a href=""http://travis-ci.org/"" rel=""nofollow"">Travis-CI</a> for smaller code bases.\nThe main advantages are:</p>\n\n<ul>\n<li>setup is done in less than half a screen of config file</li>\n<li>you can do your own installation or just use the free hosted version</li>\n<li>semi-automatic setup for github repositories</li>\n<li>no account needed on website; login via github</li>\n</ul>\n\n<p>Some limitations:</p>\n\n<ul>\n<li><p>Python is not supported as a first class language (as of time of writing; but you can use pip and apt-get to install python dependencies; see <a href=""http://www.travisswicegood.com/2011/11/11/travis-and-python/"" rel=""nofollow"">this tutorial</a>)</p></li>\n<li><p>code has to be hosted on github (at least when using the official version)</p></li>\n</ul>\n","I'm starting work on a hobby project with a python codebase and would like to set up some form of continuous integration (i.e. running a battery of test-cases each time a check-in is made and sending nag e-mails to responsible persons when the tests fail) similar to CruiseControl or TeamCity. I realize I could do this with hooks in most VCSes, but that requires that the tests run on the same machine as the version control server, which isn't as elegant as I would like. Does anyone have any suggestions for a small, user-friendly, open-source continuous integration system suitable for a Python codebase?"
3,594,116.0,2008-08-03T01:15:08Z,25,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a result set. What are the tradeoff of each?</p>\n,python sql database oracle cx-oracle,"<p>The canonical way is to use the built-in cursor iterator.</p>\n\n<pre><code>curs.execute('select * from people')\nfor row in curs:\n print row\n</code></pre>\n\n<hr>\n\n<p>You can use <code>fetchall()</code> to get all rows at once.</p>\n\n<pre><code>for row in curs.fetchall():\n print row\n</code></pre>\n\n<p>It can be convenient to use this to create a Python list containing the values returned:</p>\n\n<pre><code>curs.execute('select first_name from people')\nnames = [row[0] for row in curs.fetchall()]\n</code></pre>\n\n<p>This can be useful for smaller result sets, but can have bad side effects if the result set is large.</p>\n\n<ul>\n<li><p>You have to wait for the entire result set to be returned to\nyour client process.</p></li>\n<li><p>You may eat up a lot of memory in your client to hold\nthe built-up list.</p></li>\n<li><p>It may take a while for Python to construct and deconstruct the\nlist which you are going to immediately discard anyways.</p></li>\n</ul>\n\n<hr>\n\n<p>If you know there's a single row being returned in the result set you can call <code>fetchone()</code> to get the single row.</p>\n\n<pre><code>curs.execute('select max(x) from t')\nmaxValue = curs.fetchone()[0]\n</code></pre>\n\n<hr>\n\n<p>Finally, you can loop over the result set fetching one row at a time. In general, there's no particular advantage in doing this over using the iterator.</p>\n\n<pre><code>row = curs.fetchone()\nwhile row:\n print row\n row = curs.fetchone()\n</code></pre>\n <p>There's also the way <code>psyco-pg</code> seems to do it... From what I gather, it seems to create dictionary-like row-proxies to map key lookup into the memory block returned by the query. In that case, fetching the whole answer and working with a similar proxy-factory over the rows seems like useful idea. Come to think of it though, it feels more like Lua than Python.</p>\n\n<p>Also, this should be applicable to all <a href=""http://www.python.org/dev/peps/pep-0249/"" rel=""nofollow"">PEP-249 DBAPI2.0</a> interfaces, not just Oracle, or did you mean just <em>fastest</em> using <em>Oracle</em>?</p>\n <p>My preferred way is the cursor iterator, but setting first the arraysize property of the cursor. </p>\n\n<pre><code>curs.execute('select * from people')\ncurs.arraysize = 256\nfor row in curs:\n print row\n</code></pre>\n\n<p>In this example, cx_Oracle will fetch rows from Oracle 256 rows at a time, reducing the number of network round trips that need to be performed</p>\n",There are several ways to iterate over a result set. What are the tradeoff of each?
4,683,199.0,2008-08-03T13:19:16Z,28,Using 'in' to match an attribute of Python objects in an array,"<p>I don't remember whether I was dreaming or not but I seem to recall there being a function which allowed something like,</p>\r\n\r\n<pre><code>foo in iter_attr(array of python objects, attribute name)</code></pre>\r\n\r\n<p>I've looked over the docs but this kind of thing doesn't fall under any obvious listed headers</p>",python arrays iteration,"<p>No, you were not dreaming. Python has a pretty excellent list comprehension system that lets you manipulate lists pretty elegantly, and depending on exactly what you want to accomplish, this can be done a couple of ways. In essence, what you're doing is saying ""For item in list if criteria.matches"", and from that you can just iterate through the results or dump the results into a new list.</p>\n\n<p>I'm going to crib an example from <a href=""http://diveintopython.net/functional_programming/filtering_lists.html"" rel=""nofollow"">Dive Into Python</a> here, because it's pretty elegant and they're smarter than I am. Here they're getting a list of files in a directory, then filtering the list for all files that match a regular expression criteria.</p>\n\n<blockquote>\n<pre><code> files = os.listdir(path) \n test = re.compile(""test\.py$"", re.IGNORECASE) \n files = [f for f in files if test.search(f)]\n</code></pre>\n</blockquote>\n\n<p>You could do this without regular expressions, for your example, for anything where your expression at the end returns true for a match. There are other options like using the filter() function, but if I were going to choose, I'd go with this.</p>\n\n<p>Eric Sipple</p>\n <p>I think:</p>\r\n\r\n<pre><code>#!/bin/python<br>bar in dict(Foo)<br></code></pre>\r\n\r\n<p>Is what you are thinking of. When trying to see if a certain key exists within a dictionary in python (python's version of a hash table) there are two ways to check. First is the <strong><code>has_key()</code></strong> method attached to the dictionary and second is the example given above. It will return a boolean value.</p>\r\n\r\n<p>That should answer your question.</p>\r\n\r\n<p>And now a little off topic to tie this in to the <em>list comprehension</em> answer previously given (for a bit more clarity). <em>List Comprehensions</em> construct a list from a basic <em>for loop</em> with modifiers. As an example (to clarify slightly), a way to use the <code>in dict</code> language construct in a _list comprehension_:</p>\r\n\r\n<p>Say you have a two dimensional dictionary <strong><code>foo</code></strong> and you only want the second dimension dictionaries which contain the key <strong><code>bar</code></strong>. A relatively straightforward way to do so would be to use a <em>list comprehension</em> with a conditional as follows:</p>\r\n\r\n<pre><code>#!/bin/python<br>baz = dict([(key, value) for key, value in foo if bar in value])<br></code></pre>\r\n\r\n<p>Note the <strong><code>if bar in value</code></strong> at the end of the statement<strong>, this is a modifying clause which tells the <em>list comprehension</em> to only keep those key-value pairs which meet the conditional.</strong> In this case <strong><code>baz</code></strong> is a new dictionary which contains only the dictionaries from foo which contain bar (Hopefully I didn't miss anything in that code example... you may have to take a look at the list comprehension documentation found in <a href=""http://docs.python.org/tut/node7.html#SECTION007140000000000000000"" rel=""nofollow"">docs.python.org tutorials</a> and at <a href=""http://www.secnetix.de/olli/Python/list_comprehensions.hawk"" rel=""nofollow"">secnetix.de</a>, both sites are good references if you have questions in the future.).</p> <p>Are you looking to get a list of objects that have a certain attribute? If so, a <a href=""http://docs.python.org/tut/node7.html#SECTION007140000000000000000"">list comprehension</a> is the right way to do this.</p>\r\n\r\n<pre><code>result = [obj for obj in listOfObjs if hasattr(obj, 'attributeName')]<br></code></pre> <p>What I was thinking of can be achieved using list comprehensions, but I thought that there was a function that did this in a slightly neater way.</p>\r\n\r\n<p>i.e. 'bar' is a list of objects, all of which have the attribute 'id'</p>\r\n\r\n<p>The mythical functional way:</p>\r\n\r\n<pre><code>foo = 12<br>foo in iter_attr(bar, 'id')</code></pre>\r\n\r\n<p>The list comprehension way:</p>\r\n\r\n<pre><code>foo = 12<br>foo in [obj.id for obj in bar]</code></pre>\r\n\r\n<p>In retrospect the list comprehension way is pretty neat anyway.</p> <p>you could always write one yourself:</p>\n\n<pre><code>def iterattr(iterator, attributename):\n for obj in iterator:\n yield getattr(obj, attributename)\n</code></pre>\n\n<p>will work with anything that iterates, be it a tuple, list, or whatever.</p>\n\n<p>I love python, it makes stuff like this very simple and no more of a hassle than neccessary, and in use stuff like this is hugely elegant.</p>\n <p>If you plan on searching anything of remotely decent size, your best bet is going to be to use a dictionary or a set. Otherwise, you basically have to iterate through every element of the iterator until you get to the one you want.</p>\n\n<p>If this isn't necessarily performance sensitive code, then the list comprehension way should work. But note that it is fairly inefficient because it goes over every element of the iterator and then goes BACK over it again until it finds what it wants.</p>\n\n<p>Remember, python has one of the most efficient hashing algorithms around. Use it to your advantage.</p>\n <p>Using a list comprehension would build a temporary list, which could eat all your memory if the sequence being searched is large. Even if the sequence is not large, building the list means iterating over the whole of the sequence before <code>in</code> could start its search.</p>\n\n<p>The temporary list can be avoiding by using a generator expression:</p>\n\n<pre><code>foo = 12\nfoo in (obj.id for obj in bar)\n</code></pre>\n\n<p>Now, as long as <code>obj.id == 12</code> near the start of <code>bar</code>, the search will be fast, even if <code>bar</code> is infinitely long.</p>\n\n<p>As @Matt suggested, it's a good idea to use <code>hasattr</code> if any of the objects in <code>bar</code> can be missing an <code>id</code> attribute:</p>\n\n<pre><code>foo = 12\nfoo in (obj.id for obj in bar if hasattr(obj, 'id'))\n</code></pre>\n <p>The function you are thinking of is probably <code>operator.attrgettter</code>. For example, to get a list that contains the value of each object's ""id"" attribute:</p>\n\n<pre><code>import operator\nids = map(operator.attrgetter(""id""), bar)</code></pre>\n\n<p>If you want to check whether the list contains an object with an id == 12, then a neat and efficient (i.e. doesn't iterate the whole list unnecessarily) way to do it is:</p>\n\n<pre><code>any(obj.id == 12 for obj in bar)</code></pre>\n\n<p>If you want to use 'in' with attrgetter, while still retaining lazy iteration of the list:</p>\n\n<p><pre><code>import operator,itertools\nfoo = 12\nfoo in itertools.imap(operator.attrgetter(""id""), bar)\n</pre></code></p>\n","I don't remember whether I was dreaming or not but I seem to recall there being a function which allowed something like,\r \r lallaa\r \r I've looked over the docs but this kind of thing doesn't fall under any obvious listed headers"


In [96]:
raw_df.to_csv('Data/raw_data_with_answer_body_only.csv',
             index=False,
             encoding='utf-8')

UnicodeEncodeError: 'utf-8' codec can't encode characters in position 4266-4279: surrogates not allowed

# Test

In [14]:
df[['body']].sample()

Unnamed: 0,body
537283,"<p>I have a huge set of data which has several columns and about 10k rows in more than 100 csv files, for now I am concerned about only one column with message format and from them I want to extract two parameters. I searched extensively around and I found two solutions that seem close but are not enough close to solve the question here. <a href=""http://stackoverflow.com/questions/33735272/nsregularexpression-for-hashtags-and-mentions-with-special-characters"">ONE</a> &amp; <a href=""https://regex101.com/r/dT4mK6/1#python"" rel=""nofollow"">TWO</a></p>\n\n<p>Input : Col name <code>""Text""</code> and every message is a separate row in a csv. </p>\n\n<pre><code>""Let's Bounce!√∞≈∏Àú‚Ä∞ #[message_1]\n\n Loving the energy &amp;amp; Microphonic Mayhem while√¢‚Ç¨¬¶"" #[message_2]\n\nRT @IVijayboi: #[message_3] @Bdutt@sardesairajdeep@rahulkanwal@abhisarsharma@ppbajpayi@Abpnewd@Ndtv@Aajtak#Jihadimedia@Ibn7 happy #PresstitutesDay\n\n ""RT @RakeshKhatri23: MY LIFE #[message_4]\n\n WITHOUT YOU \n\n IS\n\n LIKE \n\n FLOWERS WITHOUT \n\n FRAGRANCE √∞≈∏‚Äô≈æ√∞≈∏‚Äô≈æ\n\n ~True Love~""\n\n\n Me &amp;amp; my baby √∞≈∏¬∂√¢¬§√Ø¬∏√∞≈∏‚Äò¬≠ @ Home Sweet Home #[message_5]\n</code></pre>\n\n<p>The input is a CSV file with several other columns in the data but I am interested only in this column. I want to separate the <code>@name</code> and <code>#keyword</code>from the input into a new column like: </p>\n\n<p>expected output </p>\n\n<pre><code>text, mentions, keywords \n[message], NAN, NAN\n[message], NAN, NAN\n[message], @IVijayboi, #Jihadimedia \n @Bdutt #PresstitutesDay\n @sardesairajdeep \n @rahulkanwal \n @abhisarsharma \n @ppbajpayi \n @Abpnewd \n @Ndtv \n @Aajtak \n @Ibn7\n</code></pre>\n\n<p>As we see in the input first and second message has no <code>@</code> and <code>#</code> so the column values <code>NAN</code> but for the third message it has 10 <code>@</code> and 2 <code>#</code> keywords. </p>\n\n<p>In simple words how do I separate the @ mentioned names and # keywords from the message to a separate column. </p>\n"


In [15]:
test = """<p>I need your advice and help for my code.</p>\n\n<p>I am making a GUI program. Basically the program does the followings:</p>\n\n<ol>\n<li>Gets input from the user</li>\n<li>When a certain button is pushed, the program retrieves all the input and saved them in the database</li>\n<li>The program does computations.</li>\n<li>Shows the output</li>\n</ol>\n\n<p>Here is the simple version of the program, calculator program:\n<a href="http://i.stack.imgur.com/E7y5r.png" rel="nofollow"><img src="http://i.stack.imgur.com/E7y5r.png" alt="simple GUI program, calculator"></a></p>\n\n<p>As you can see in the figure, it takes 4 input from user, (1) any integer for var1, (2) any integer for var2, (3) operator (addition or subtraction) and (4) click button.</p>\n\n<p>The GUI was designed using QtDesigner.</p>\n\n<p>And here is my code:</p>\n\n<pre><code>import sys\nimport sqlite3\nfrom PyQt4 import QtCore, QtGui\nfrom testgui import Ui_MainWindow\n\nclass ShowAndInput(QtGui.QMainWindow):\n def __init__(self, parent=None):\n QtGui.QWidget.__init__(self, parent)\n self.ui = Ui_MainWindow()\n self.ui.setupUi(self)\n\n #connect to the database\n self.connect_to_db()\n\n #create table in the database\n self.create_table()\n\n #add items to the comboBox\n self.add_items()\n\n #calculate button to start calculating, gives signal to the next class Calculate\n self.ui.pushButton_3.clicked.connect(self.calculate)\n\n #close button to close the window\n QtCore.QObject.connect(self.ui.pushButton,QtCore.SIGNAL("clicked()"), quit)\n\n self.conn.close()\n\n def connect_to_db(self):\n self.conn = sqlite3.connect('testguidb.sqlite3')\n self.cur = self.conn.cursor()\n\n def create_table(self):\n self.cur.execute('''CREATE TABLE IF NOT EXISTS Input (var1 INTEGER, var2 INTEGER)''')\n self.conn.commit()\n\n def add_items(self):\n lst = ['addition', 'subtraction']\n self.ui.comboBox.clear()\n self.ui.comboBox.addItems(lst)\n\n def calculate(self):\n calculate = Calculate(self)\n calculate.exec_()\n</code></pre>\n\n<p>So the purpose of the first class was to show the GUI and take input from the user. If user clicks the pushbutton named 'Calculate', it should connect to the new class named 'Calculate'.</p>\n\n<p>Here is my code for the Calculate class:</p>\n\n<pre><code>class Calculate(QtGui.QMainWindow):\n def __init__(self, parent = None):\n QtGui.QWidget.__init__(parent)\n self.ui = Ui_MainWindow()\n self.ui.setupUi(self)\n\n #validator to validate that the input is integer or digit\n\n #connect to database\n self.connect_to_db()\n\n #get the input values and store them in the database\n self.get_the_values()\n\n #print the output\n PrintOutcome()\n\n def connect_to_db(self):\n self.conn = sqlite3.connect('testguidb.sqlite3')\n self.cur = self.conn.cursor()\n\n def get_the_values(self):\n dicti = {}\n dicti['var1'] = self.ui.lineEdit.text()\n dicti['var2'] = self.ui.lineEdit_2.text()\n for key, val in sorted(dicti.items()):\n key = str(key)\n val = int(val)\n self.cur.execute('INSERT OR IGNORE INTO Input (?) VALUES (?)',(key,val))\n self.conn.commit()\n</code></pre>\n\n<p>And at the end of this class, it calls another class named 'PrintOutcome' to print out the result in the QTextBrowser. Here is the last piece of the code:</p>\n\n<pre><code>class PrintOutcome(QtGui.QMainWindow):\n def __init__(self, parent = None):\n QtGui.QWidget.__init__(parent)\n self.ui = Ui_MainWindow()\n self.ui.setupUi(self)\n\n #connect to database\n self.connect_to_db()\n\n #get the value of the combobox\n self.get_combox() \n\n #print the output\n if self.operator == 'addition': self.print_add()\n elif self.operator == 'subtraction': self.print_sub()\n\n def connect_to_db(self):\n self.conn = sqlite3.connect('testguidb.sqlite3')\n self.cur = self.conn.cursor()\n\n def get_combox(self):\n self.operator = str(self.ui.comboBox.currentText())\n\n def print_add(self):\n dicti={}\n for i in range(2):\n i = i + 1\n dicti['var{0}'.format(i)] = self.cur.execute('SELECT var{0} FROM Input'.format(i))\n dicti['var{0}'.format(i)] = int(self.cur.fetchone()[0])\n text = str(dicti['var{0}'.format(i)])\n self.ui.textBrowser.append(text)\n result = dicti['var1'] + dicti['var2']\n textsep = '-'*25+'(+)' \n text = str(result) \n self.ui.textBrowser.append(text)\n self.ui.textBrowser.append(text)\n\n def print_sub(self):\n dicti={}\n for i in range(2):\n i = i + 1\n dicti['var{0}'.format(i)] = self.cur.execute('SELECT var{0} FROM Input'.format(i))\n dicti['var{0}'.format(i)] = int(self.cur.fetchone()[0])\n text = str(dicti['var{0}'.format(i)])\n self.ui.textBrowser.append(text)\n result = dicti['var1'] + dicti['var2']\n textsep = '-'*25+'(-)' \n text = str(result) \n self.ui.textBrowser.append(text)\n self.ui.textBrowser.append(text)\n\nif __name__=='__main__':\n app = QtGui.QApplication(sys.argv)\n myapp = ShowAndInput()\n myapp.show()\n sys.exit(app.exec_())\n</code></pre>\n\n<p>When I ran it, everything was fine. But when I pressed the calculate button, it returned an error message.\n<a href="http://i.stack.imgur.com/UybsA.png" rel="nofollow"><img src="http://i.stack.imgur.com/UybsA.png" alt="everything is fine when I run it"></a></p>\n\n<p><a href="http://i.stack.imgur.com/aYYTJ.png" rel="nofollow"><img src="http://i.stack.imgur.com/aYYTJ.png" alt="Get an error message when I click the calculate button"></a></p>\n\n<p>Here is the error message: RuntimeError: super-class <strong>init</strong>() of type Calculate was never called.</p>\n\n<p>Any respond will be much appreciated. Thanks a lot.</p>\n\n<p>Regards,</p>\n\n<p>Arnold</p>\n"""
# test = 'asfda<code>asdfadf</code>asfdasf'

In [16]:
tags = set()
tag_pattern = re.compile('.*<(?!a|img|div|blockquote|pre|ol)(.*?)>.*', re.IGNORECASE)
for row in df.iterrows():
    q_body = row[1][5]
    tags_in_q = tag_pattern.findall(q_body)
    for tag_in_q in tags_in_q:
        tags.add(tag_in_q)

In [17]:
print(tags)

{'/dd', '/Code', '/em', 'BR', '/B', '/strike', 'em', '/ol', 'hr/', 'dd', 'LI', '/h3', 'sup', 'P', 'li', '/kbd', 'h3', 'sub', 'hr /', 'br /', '/strong', 'b', 'Br', 'Br/', 'Code', 'strong', 'B', '/dl', '/blockquote', '/s', 'h1', '/li', '/OL', 'i', '/h1', 'p', '/H2', 'br', 'HR', 'br ', 'br/', '/sub', '/PRE', '/sup', 'hr', '/I', '/S', 'Br /', 'del', '/a', '/pre', '/code', '/b', '/ul', '/del', '/P', 'bR', 'H1', 'BR/', 'kbd', '/h2', '/p', '/i', '/div', 'strike', 'code', 'ul'}


In [90]:
from bs4 import BeautifulSoup
from bs4.element import Comment

# def tag_visible(element):
#     if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
#         return False
#     if isinstance(element, Comment):
#         return False
#     return True


# def text_from_html(body):
#     soup = BeautifulSoup(body, 'html.parser')
#     texts = soup.findAll(text=True)
#     visible_texts = filter(tag_visible, texts)  
#     return u" ".join(t.strip() for t in visible_texts)


content_pattern = re.compile('.*<(?:p|li|P|h1|H2|h3)>(?!<a.*?/a>)?(.+?)</?(?:p|li|P|h1|H2|h3)>.*')
texts = list()
i=0
for row in df.iterrows():
    print('-' * 110)
    print('-' * 110)
    print(i)
    q_body = row[1][5]
    print(q_body)
    print('-' * 110)
    print(extract_text_from_html(q_body))
#     texts_in_q = content_pattern.findall(q_body)
#     for text_in_q in texts_in_q:
#         print(text_in_q)
#         texts.append(text_in_q)
    i += 1
    if i == 8:
        break
    

--------------------------------------------------------------------------------------------------------------
--------------------------------------------------------------------------------------------------------------
0
<p>I am using the Photoshop's javascript API to find the fonts in a given PSD.</p>

<p>Given a font name returned by the API, I want to find the actual physical font file that that font name corresponds to on the disc.</p>

<p>This is all happening in a python program running on OSX so I guess I'm looking for one of:</p>

<ul>
<li>Some Photoshop javascript</li>
<li>A Python function</li>
<li>An OSX API that I can call from python</li>
</ul>

--------------------------------------------------------------------------------------------------------------
I am using the Photoshop's javascript API to find the fonts in a given PSD. Given a font name returned by the API, I want to find the actual physical font file that that font name corresponds to on the disc. This is all

In [None]:
# Get all tags
"""
Pseudocode

Find all tags
Go through each body
Extract necessary content and concatenate
return concatenated content
"""

# Example for each tag present in data set

all_tags = dict()

for row in raw_df[["body"]].iterrows():
    q_body = row[1][0]
    bs4_obj = BeautifulSoup(q_body)
    for tag in bs4_obj.find_all(True):
        if tag.name not in all_tags:
            all_tags[tag.name] = bs4_obj.find_all(tag.name)[0]
    
for key, val in all_tags.items():
    print('-' * 110)
    print('-' * 110)
    print(key)
    print('-' * 110)
    print(val)

In [49]:
html_tags = [tag for tag in all_tags.keys() if tag not in ['a', 'pre', 'code', 'ol', 'hr']]
html_tags

['p',
 'ul',
 'li',
 'strong',
 'em',
 'kbd',
 'br',
 'blockquote',
 'h2',
 'i',
 'b',
 'h3',
 'h1',
 'img',
 'sup',
 'sub',
 'strike',
 's',
 'del',
 'dd',
 'dl',
 'dt',
 'div']